1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 01:05:38 +01:00

Staggereed move to accelerator

This commit is contained in:
Peter Boyle 2020-05-28 08:33:06 -04:00
parent cf2938688a
commit 006cc8a8f1
18 changed files with 1319 additions and 356 deletions

View File

@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
NAMESPACE_CHECK(Staggered);
@ -282,6 +283,10 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

View File

@ -62,8 +62,8 @@ public:
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base

View File

@ -0,0 +1,194 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
#define GRID_QCD_NAIVE_STAG_FERMION_H
NAMESPACE_BEGIN(Grid);
class NaiveStaggeredFermionStatic {
public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
};
template <class Impl>
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
void Mooee(const FermionField &in, FermionField &out);
void MooeeDag(const FermionField &in, FermionField &out);
void MooeeInv(const FermionField &in, FermionField &out);
void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop (const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st,
DoubledGaugeField &U,
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
NaiveStaggeredFermion(GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge (const GaugeField &_U );
DoubledGaugeField &GetU(void) { return Umu ; } ;
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
// any other parameters of action ???
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &srct,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
NAMESPACE_END(Grid);
#endif

View File

@ -47,23 +47,34 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
public:
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
protected:
///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
template<int Naik>
void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik>
void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik>
void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
@ -71,15 +82,18 @@ public:
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template<int Naik>
void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik>
void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
template<int Naik>
void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
@ -87,27 +101,10 @@ public:
///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
public:

View File

@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor;
int LLs = in.Grid()->_rdimensions[0];
@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DhopFaceTime-=usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
double ctime=0;
double ptime=0;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
// Remove explicit thread mapping introduced for OPA reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
DhopComputeTime-=usecond();
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U.Grid()->oSites(); // 4d vol
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
}
}
ptime = usecond() - start;
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
}
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopCommTime += ctime;
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopComputeTime+=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
DhopComputeTime2-=usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
});
} else {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
});
DhopComputeTime2-=usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime2+=usecond();
#else
assert(0);
#endif
}
template<class Impl>
@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor;
int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond();
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
});
} else {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
});
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
//double t2=usecond();
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
}
/*CHANGE END*/

View File

@ -395,11 +395,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
template <class Impl>
@ -409,7 +407,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
Compressor compressor;
int len = U.Grid()->oSites();
@ -418,60 +415,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
}
} else {
st.CommunicateThreaded();
}
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
@ -479,28 +446,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopComputeTime2 -= usecond();
{
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
} else {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
}
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
#else
assert(0);
#endif
}
@ -520,19 +470,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
DhopComputeTime -= usecond();
if (dag == DaggerYes) {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
} else {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();

View File

@ -0,0 +1,499 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
_tmp(&Hgrid)
{
int vol4;
int LLs=1;
c1=_c1;
u0=_u0;
vol4= _grid->oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4= _cbgrid->oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
{
ImportGauge(_U);
}
////////////////////////////////////////////////////////////
// Momentum space propagator should be
// https://arxiv.org/pdf/hep-lat/9712010.pdf
//
// mom space action.
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
//
// must track through staggered flavour/spin reduction in literature to
// turn to free propagator for the one component chi field, a la page 4/5
// of above link to implmement fourier based solver.
////////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
{
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
{
GaugeLinkField U(GaugeGrid());
DoubledGaugeField _UUU(GaugeGrid());
////////////////////////////////////////////////////////
// Double Store should take two fields for Naik and one hop separately.
// Discard teh Naik as Naive
////////////////////////////////////////////////////////
Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
////////////////////////////////////////////////////////
// Apply scale factors to get the right fermion Kinetic term
// Could pass coeffs into the double store to save work.
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
U = PeekIndex<LorentzIndex>(Umu, mu+4);
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
}
CopyGaugeCheckerboards();
}
/////////////////////////////
// Implement the interface
/////////////////////////////
template <class Impl>
RealD NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo);
return axpy_norm(out, mass, in, out);
}
template <class Impl>
RealD NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes);
return axpy_norm(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass);
out = scal * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField & mat,
const FermionField &A, const FermionField &B, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor;
FermionField Btilde(B.Grid());
FermionField Atilde(B.Grid());
Atilde = A;
st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) {
////////////////////////
// Call the single hop
////////////////////////
auto U_v = U.View(CpuRead);
auto B_v = B.View(CpuWrite);
auto Btilde_v = Btilde.View(CpuWrite);
thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
});
assert(0);// need to figure out the force interface with a blasted three link term.
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _grid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
mat.Checkerboard() = U.Checkerboard();
DerivInternal(Stencil, Umu, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Even);
assert(U.Checkerboard() == Odd);
mat.Checkerboard() = Odd;
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Odd);
assert(U.Checkerboard() == Even);
mat.Checkerboard() = Even;
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
DhopDir(in, out, dir, disp);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
assert(0); // Not implemented yet
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
{
Compressor compressor;
Stencil.HaloExchange(in, compressor);
auto Umu_v = Umu.View(CpuRead);
auto in_v = in.View(CpuRead);
auto out_v = out.View(CpuWrite);
// thread_for( sss, in.Grid()->oSites(),{
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
// });
assert(0);
};
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,lo,U,in,out,dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
Compressor compressor;
int len = U.Grid()->oSites();
DhopTotalTime -= usecond();
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=0;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime -= usecond();
DhopComputeTime2 -= usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void NaiveStaggeredFermion<Impl>::Report(void)
{
Coordinate latt = _grid->GlobalDimensions();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
assert(0);
}
NAMESPACE_END(Grid);

View File

@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
assert(0);
@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
gauge2 =(uint64_t)&UU[sU]( Z ); \
gauge3 =(uint64_t)&UU[sU]( T );
// This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
// for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
}
#include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
{
#ifdef AVX512
@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
// This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1);
@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
}
#include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{
#ifdef AVX512
@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1);

View File

@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
}
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
}
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) {
result()()(0) = - even_0 - odd_0;
@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag)
{
typedef typename Simd::scalar_type S;
@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
}
// Add sum of all exterior connected stencil legs
if ( nmu ) {
if ( dag ) {
@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
}
}
/*
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI
NAMESPACE_END(Grid);

View File

@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
chi_p = &in[SE->_offset]; \
} \
} else { \
chi_p = &buf[SE->_offset]; \
@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
chi_p = &in[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U[sU], *chi_p, Dir); \
multLink(Uchi, U[sU], *chi_p, Dir); \
}
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid);
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
chi_p = &buf[SE->_offset]; \
multLink(Uchi, U[sU], *chi_p, Dir); \
multLink(Uchi, U[sU], *chi_p, Dir); \
}
template <class Impl>
@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag) {
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag)
{
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
int ptype;
int skew;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
//
// int sF=LLs*sU+s;
{
skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) {
Uchi = - Uchi;
}
@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
// Only contributions from interior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
int ptype;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s;
{
skew = 0;
Uchi=Zero();
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) {
Uchi = - Uchi;
}
@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
// Only contributions from exterior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p;
// SiteSpinor chi;
@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
int nmu=0;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s;
{
skew = 0;
Uchi=Zero();
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( nmu ) {
if ( dag ) {
out[sF] = out[sF] - Uchi;
@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=1;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=0;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionFieldView &in, FermionFieldView &out,
int dag,int interior,int exterior)
{
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break;
#endif
case OptHandUnroll:
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
};
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
{
// Disp should be either +1,-1,+3,-3
// What about "dag" ?
@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
assert(0);
}
#define KERNEL_CALLNB(A,improved) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
#define ASM_CALL(A) \
const uint64_t NN = Nsite*Ls; \
thread_for( ss, NN, { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
auto UUU_v = UUU.View(AcceleratorRead);
auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(AcceleratorWrite);
auto st_v = st.View(AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
auto UUU_v= U.View(AcceleratorRead);
auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(AcceleratorWrite);
auto st_v = st.View(AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
}
}
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid);

View File

@ -496,5 +496,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
assert(0 && " Kernel optimisation case not covered ");
}
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid);

View File

@ -0,0 +1,36 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
NAMESPACE_END(Grid);

View File

@ -0,0 +1,37 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class NaiveStaggeredFermion<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../NaiveStaggeredFermionInstantiation.cc.master

View File

@ -0,0 +1 @@
../NaiveStaggeredFermionInstantiation.cc.master

View File

@ -88,6 +88,7 @@ done
CC_LIST=" \
ImprovedStaggeredFermion5DInstantiation \
ImprovedStaggeredFermionInstantiation \
NaiveStaggeredFermionInstantiation \
StaggeredKernelsInstantiation "
for impl in $STAG_IMPL_LIST

View File

@ -87,26 +87,6 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
ref = Zero();
/*
{ // Naive wilson implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
// ref = src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
tmp = U[mu]*Cshift(src,mu,1);
for(int i=0;i<ref._odata.size();i++){
ref[i]+= tmp[i] - Gamma(Gmu[mu])*tmp[i]; ;
}
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu,-1);
for(int i=0;i<ref._odata.size();i++){
ref[i]+= tmp[i] + Gamma(Gmu[mu])*tmp[i]; ;
}
}
}
ref = -0.5*ref;
*/
RealD mass=0.1;
RealD c1=9.0/8.0;
@ -125,10 +105,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Grid_finalize();
}

View File

@ -0,0 +1,282 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_wilson.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid);
pRNG.SeedFixedIntegers(seeds);
// pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
typedef typename NaiveStaggeredFermionR::FermionField FermionField;
typedef typename NaiveStaggeredFermionR::ComplexField ComplexField;
typename NaiveStaggeredFermionR::ImplParams params;
FermionField src (&Grid); random(pRNG,src);
FermionField result(&Grid); result=Zero();
FermionField ref(&Grid); ref=Zero();
FermionField tmp(&Grid); tmp=Zero();
FermionField err(&Grid); tmp=Zero();
FermionField phi (&Grid); random(pRNG,phi);
FermionField chi (&Grid); random(pRNG,chi);
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
// Only one non-zero (y)
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
/* Debug force unit
U[mu] = 1.0;
PokeIndex<LorentzIndex>(Umu,U[mu],mu);
*/
}
ref = Zero();
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD u0=1.0;
{ // Simple improved staggered implementation
ref = Zero();
RealD c1tad = 0.5*c1/u0;
Lattice<iScalar<vInteger> > coor(&Grid);
Lattice<iScalar<vInteger> > x(&Grid); LatticeCoordinate(x,0);
Lattice<iScalar<vInteger> > y(&Grid); LatticeCoordinate(y,1);
Lattice<iScalar<vInteger> > z(&Grid); LatticeCoordinate(z,2);
Lattice<iScalar<vInteger> > t(&Grid); LatticeCoordinate(t,3);
Lattice<iScalar<vInteger> > lin_z(&Grid); lin_z=x+y;
Lattice<iScalar<vInteger> > lin_t(&Grid); lin_t=x+y+z;
for(int mu=0;mu<Nd;mu++){
// Staggered Phase.
ComplexField phases(&Grid); phases=1.0;
if ( mu == 1 ) phases = where( mod(x ,2)==(Integer)0, phases,-phases);
if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases);
if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases);
tmp = PeriodicBC::CovShiftForward(U[mu],mu,src);
ref = ref +c1tad*tmp*phases; // Forward 1 hop
tmp = PeriodicBC::CovShiftBackward(U[mu],mu,src);
ref = ref -c1tad*tmp*phases; // Backward 1 hop
}
// ref = ref + mass * src;
}
NaiveStaggeredFermionR Ds(Umu,Grid,RBGrid,mass,c1,u0,params);
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
int ncall=1000;
double t0=usecond();
for(int i=0;i<ncall;i++){
Ds.Dhop(src,result,0);
}
double t1=usecond();
double t2;
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
std::cout<<GridLogMessage<<"= Testing that Deo + Doe = Dunprec "<<std::endl;
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
FermionField src_e (&RBGrid);
FermionField src_o (&RBGrid);
FermionField r_e (&RBGrid);
FermionField r_o (&RBGrid);
FermionField r_eo (&Grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
Ds.Meooe(src_e,r_o); std::cout<<GridLogMessage<<"Applied Meo"<<std::endl;
Ds.Meooe(src_o,r_e); std::cout<<GridLogMessage<<"Applied Moe"<<std::endl;
Ds.Dhop (src,ref,DaggerNo);
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err= ref - r_eo;
std::cout<<GridLogMessage << "EO norm diff "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(r_eo) <<std::endl;
std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
std::cout<<GridLogMessage<<"= Test Ddagger is the dagger of D by requiring "<<std::endl;
std::cout<<GridLogMessage<<"= < phi | Deo | chi > * = < chi | Deo^dag| phi> "<<std::endl;
std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
FermionField chi_e (&RBGrid);
FermionField chi_o (&RBGrid);
FermionField dchi_e (&RBGrid);
FermionField dchi_o (&RBGrid);
FermionField phi_e (&RBGrid);
FermionField phi_o (&RBGrid);
FermionField dphi_e (&RBGrid);
FermionField dphi_o (&RBGrid);
pickCheckerboard(Even,chi_e,chi);
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
Ds.Meooe(chi_e,dchi_o);
Ds.Meooe(chi_o,dchi_e);
Ds.MeooeDag(phi_e,dphi_o);
Ds.MeooeDag(phi_o,dphi_e);
ComplexD pDce = innerProduct(phi_e,dchi_e);
ComplexD pDco = innerProduct(phi_o,dchi_o);
ComplexD cDpe = innerProduct(chi_e,dphi_e);
ComplexD cDpo = innerProduct(chi_o,dphi_o);
std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
std::cout<<GridLogMessage<<"= Test MeeInv Mee = 1 "<<std::endl;
std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
pickCheckerboard(Even,chi_e,chi);
pickCheckerboard(Odd ,chi_o,chi);
Ds.Mooee(chi_e,src_e);
Ds.MooeeInv(src_e,phi_e);
Ds.Mooee(chi_o,src_o);
Ds.MooeeInv(src_o,phi_o);
setCheckerboard(phi,phi_e);
setCheckerboard(phi,phi_o);
err = phi-chi;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<< std::endl;
std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
std::cout<<GridLogMessage<<"= Test MeeInvDag MeeDag = 1 "<<std::endl;
std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
pickCheckerboard(Even,chi_e,chi);
pickCheckerboard(Odd ,chi_o,chi);
Ds.MooeeDag(chi_e,src_e);
Ds.MooeeInvDag(src_e,phi_e);
Ds.MooeeDag(chi_o,src_o);
Ds.MooeeInvDag(src_o,phi_o);
setCheckerboard(phi,phi_e);
setCheckerboard(phi,phi_o);
err = phi-chi;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<< std::endl;
std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
std::cout<<GridLogMessage<<"= Test MpcDagMpc is Hermitian "<<std::endl;
std::cout<<GridLogMessage<<"=============================================================="<<std::endl;
random(pRNG,phi);
random(pRNG,chi);
pickCheckerboard(Even,chi_e,chi);
pickCheckerboard(Odd ,chi_o,chi);
pickCheckerboard(Even,phi_e,phi);
pickCheckerboard(Odd ,phi_o,phi);
SchurDiagMooeeOperator<NaiveStaggeredFermionR,FermionField> HermOpEO(Ds);
HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2);
HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2);
HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2);
HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2);
pDce = innerProduct(phi_e,dchi_e);
pDco = innerProduct(phi_o,dchi_o);
cDpe = innerProduct(chi_e,dphi_e);
cDpo = innerProduct(chi_o,dphi_o);
std::cout<<GridLogMessage <<"e "<<pDce<<" "<<cDpe <<std::endl;
std::cout<<GridLogMessage <<"o "<<pDco<<" "<<cDpo <<std::endl;
std::cout<<GridLogMessage <<"pDce - conj(cDpo) "<< pDco-conj(cDpo) <<std::endl;
std::cout<<GridLogMessage <<"pDco - conj(cDpe) "<< pDce-conj(cDpe) <<std::endl;
Grid_finalize();
}