mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
simd in 5th dimension support
This commit is contained in:
parent
806a83d38b
commit
9b6ab6db16
@ -288,12 +288,8 @@ PARALLEL_FOR_LOOP
|
|||||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
if ( Impl::overlapCommsCompute () ) {
|
|
||||||
DhopInternalCommsOverlapCompute(st,U,in,out,dag);
|
|
||||||
} else {
|
|
||||||
DhopInternalCommsThenCompute(st,U,in,out,dag);
|
DhopInternalCommsThenCompute(st,U,in,out,dag);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
|
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag) {
|
const FermionField &in, FermionField &out,int dag) {
|
||||||
@ -331,15 +327,6 @@ PARALLEL_FOR_LOOP
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
|
|
||||||
const FermionField &in, FermionField &out,int dag) {
|
|
||||||
|
|
||||||
assert(0);
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
FermOpTemplateInstantiate(WilsonFermion);
|
FermOpTemplateInstantiate(WilsonFermion);
|
||||||
GparityFermOpTemplateInstantiate(WilsonFermion);
|
GparityFermOpTemplateInstantiate(WilsonFermion);
|
||||||
|
|
||||||
|
@ -116,9 +116,6 @@ namespace Grid {
|
|||||||
|
|
||||||
void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
|
void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag) ;
|
const FermionField &in, FermionField &out,int dag) ;
|
||||||
void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
|
|
||||||
const FermionField &in, FermionField &out,int dag) ;
|
|
||||||
|
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
WilsonFermion(GaugeField &_Umu,
|
WilsonFermion(GaugeField &_Umu,
|
||||||
|
@ -68,10 +68,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
// some assertions
|
// some assertions
|
||||||
assert(FiveDimGrid._ndimension==5);
|
assert(FiveDimGrid._ndimension==5);
|
||||||
assert(FourDimGrid._ndimension==4);
|
assert(FourDimGrid._ndimension==4);
|
||||||
|
|
||||||
assert(FiveDimRedBlackGrid._ndimension==5);
|
assert(FiveDimRedBlackGrid._ndimension==5);
|
||||||
assert(FourDimRedBlackGrid._ndimension==4);
|
assert(FourDimRedBlackGrid._ndimension==4);
|
||||||
|
|
||||||
assert(FiveDimRedBlackGrid._checker_dim==1);
|
assert(FiveDimRedBlackGrid._checker_dim==1);
|
||||||
|
|
||||||
// Dimension zero of the five-d is the Ls direction
|
// Dimension zero of the five-d is the Ls direction
|
||||||
@ -106,6 +104,70 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
dslashtime=0;
|
dslashtime=0;
|
||||||
dslash1time=0;
|
dslash1time=0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
|
||||||
|
GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
RealD _M5,const ImplParams &p) :
|
||||||
|
Kernels(p),
|
||||||
|
_FiveDimGrid (&FiveDimGrid),
|
||||||
|
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
|
||||||
|
_FourDimGrid (&FourDimGrid),
|
||||||
|
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
|
||||||
|
Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
|
||||||
|
StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
|
||||||
|
StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
|
||||||
|
M5(_M5),
|
||||||
|
Umu(_FourDimGrid),
|
||||||
|
UmuEven(_FourDimRedBlackGrid),
|
||||||
|
UmuOdd (_FourDimRedBlackGrid),
|
||||||
|
Lebesgue(_FourDimGrid),
|
||||||
|
LebesgueEvenOdd(_FourDimRedBlackGrid)
|
||||||
|
{
|
||||||
|
int nsimd = Simd::Nsimd();
|
||||||
|
|
||||||
|
// some assertions
|
||||||
|
assert(FiveDimGrid._ndimension==5);
|
||||||
|
assert(FiveDimRedBlackGrid._ndimension==5);
|
||||||
|
assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
|
||||||
|
assert(FourDimGrid._ndimension==4);
|
||||||
|
assert(FourDimRedBlackGrid._ndimension==4);
|
||||||
|
|
||||||
|
// Dimension zero of the five-d is the Ls direction
|
||||||
|
Ls=FiveDimGrid._fdimensions[0];
|
||||||
|
assert(FiveDimGrid._processors[0] ==1);
|
||||||
|
assert(FiveDimGrid._simd_layout[0] ==nsimd);
|
||||||
|
|
||||||
|
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
|
||||||
|
assert(FiveDimRedBlackGrid._processors[0] ==1);
|
||||||
|
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
|
||||||
|
|
||||||
|
// Other dimensions must match the decomposition of the four-D fields
|
||||||
|
for(int d=0;d<4;d++){
|
||||||
|
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
|
||||||
|
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
|
||||||
|
|
||||||
|
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
|
||||||
|
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
|
||||||
|
|
||||||
|
assert(FourDimGrid._simd_layout[d]=1);
|
||||||
|
assert(FourDimRedBlackGrid._simd_layout[d] ==1);
|
||||||
|
assert(FourDimRedBlackGrid._simd_layout[d] ==1);
|
||||||
|
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
|
||||||
|
|
||||||
|
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
|
||||||
|
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
|
||||||
|
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate the required comms buffer
|
||||||
|
ImportGauge(_Umu);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||||
{
|
{
|
||||||
@ -294,11 +356,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
|||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
|
||||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||||
|
int LLs = in._grid->_rdimensions[0];
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
int HT = GridThread::GetHyperThreads();
|
|
||||||
int cores = GridThread::GetCores();
|
|
||||||
int nwork = U._grid->oSites();
|
|
||||||
|
|
||||||
commtime -=usecond();
|
commtime -=usecond();
|
||||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||||
@ -318,97 +376,48 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
|||||||
if( this->HandOptDslash ) {
|
if( this->HandOptDslash ) {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
int sF = s+LLs*sU;
|
||||||
int sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
{
|
for(int s=0;s<LLs;s++){
|
||||||
int sd;
|
|
||||||
for(sd=0;sd<Ls;sd++){
|
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
int sF = sd+Ls*sU;
|
int sF = s+LLs*sU;
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
if( this->AsmOptDslash ) {
|
if( this->AsmOptDslash ) {
|
||||||
// for(int i=0;i<1;i++){
|
|
||||||
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
|
||||||
// PerformanceCounter Counter(i);
|
|
||||||
// Counter.Start();
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
|
||||||
for(int t=0;t<threads;t++){
|
|
||||||
|
|
||||||
int hyperthread = t%HT;
|
|
||||||
int core = t/HT;
|
|
||||||
|
|
||||||
int sswork, swork,soff,ssoff, sU,sF;
|
|
||||||
|
|
||||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
|
||||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
|
||||||
|
|
||||||
for(int ss=0;ss<sswork;ss++){
|
|
||||||
for(int s=soff;s<soff+swork;s++){
|
|
||||||
|
|
||||||
sU=ss+ ssoff;
|
|
||||||
|
|
||||||
if ( LebesgueOrder::UseLebesgueOrder ) {
|
|
||||||
sU = lo.Reorder(sU);
|
|
||||||
}
|
|
||||||
sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Counter.Stop();
|
|
||||||
// Counter.Report();
|
|
||||||
// }
|
|
||||||
} else if( this->HandOptDslash ) {
|
|
||||||
/*
|
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static)
|
|
||||||
for(int t=0;t<threads;t++){
|
|
||||||
|
|
||||||
int hyperthread = t%HT;
|
|
||||||
int core = t/HT;
|
|
||||||
|
|
||||||
int sswork, swork,soff,ssoff, sU,sF;
|
|
||||||
|
|
||||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
|
||||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
|
||||||
|
|
||||||
for(int ss=0;ss<sswork;ss++){
|
|
||||||
sU=ss+ ssoff;
|
|
||||||
for(int s=soff;s<soff+swork;s++){
|
|
||||||
sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
int sF = s+LLs*sU;
|
||||||
int sF = s+Ls*sU;
|
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if( this->HandOptDslash ) {
|
||||||
|
PARALLEL_FOR_LOOP
|
||||||
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sU=ss;
|
||||||
|
int sF = s+LLs*sU;
|
||||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
for(int s=0;s<Ls;s++){
|
int sF = s+LLs*sU;
|
||||||
int sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -418,251 +427,6 @@ PARALLEL_FOR_LOOP
|
|||||||
alltime+=usecond();
|
alltime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField & U,
|
|
||||||
const FermionField &in, FermionField &out,int dag)
|
|
||||||
{
|
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
|
||||||
alltime-=usecond();
|
|
||||||
Compressor compressor(dag);
|
|
||||||
|
|
||||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
int HT = GridThread::GetHyperThreads();
|
|
||||||
int cores = GridThread::GetCores();
|
|
||||||
int nwork = U._grid->oSites();
|
|
||||||
|
|
||||||
commtime -=usecond();
|
|
||||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
|
||||||
st.HaloExchangeComplete(handle);
|
|
||||||
commtime +=usecond();
|
|
||||||
|
|
||||||
jointime -=usecond();
|
|
||||||
jointime +=usecond();
|
|
||||||
|
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
|
||||||
// Not loop ordering and data layout.
|
|
||||||
// Designed to create
|
|
||||||
// - per thread reuse in L1 cache for U
|
|
||||||
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
|
||||||
|
|
||||||
#pragma omp parallel
|
|
||||||
{
|
|
||||||
for(int jjj=0;jjj<100;jjj++){
|
|
||||||
#pragma omp barrier
|
|
||||||
dslashtime -=usecond();
|
|
||||||
if ( dag == DaggerYes ) {
|
|
||||||
if( this->HandOptDslash ) {
|
|
||||||
#pragma omp for
|
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
|
||||||
int sU=ss;
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
int sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
|
|
||||||
#pragma omp for
|
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
|
||||||
{
|
|
||||||
int sd;
|
|
||||||
for(sd=0;sd<Ls;sd++){
|
|
||||||
int sU=ss;
|
|
||||||
int sF = sd+Ls*sU;
|
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if( this->AsmOptDslash ) {
|
|
||||||
// for(int i=0;i<1;i++){
|
|
||||||
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
|
||||||
// PerformanceCounter Counter(i);
|
|
||||||
// Counter.Start();
|
|
||||||
|
|
||||||
#pragma omp for
|
|
||||||
for(int t=0;t<threads;t++){
|
|
||||||
|
|
||||||
int hyperthread = t%HT;
|
|
||||||
int core = t/HT;
|
|
||||||
|
|
||||||
int sswork, swork,soff,ssoff, sU,sF;
|
|
||||||
|
|
||||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
|
||||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
|
||||||
|
|
||||||
for(int ss=0;ss<sswork;ss++){
|
|
||||||
for(int s=soff;s<soff+swork;s++){
|
|
||||||
|
|
||||||
sU=ss+ ssoff;
|
|
||||||
|
|
||||||
if ( LebesgueOrder::UseLebesgueOrder ) {
|
|
||||||
sU = lo.Reorder(sU);
|
|
||||||
}
|
|
||||||
sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Counter.Stop();
|
|
||||||
// Counter.Report();
|
|
||||||
// }
|
|
||||||
} else if( this->HandOptDslash ) {
|
|
||||||
#pragma omp for
|
|
||||||
|
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
|
||||||
int sU=ss;
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
int sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#pragma omp for
|
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
|
||||||
int sU=ss;
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
int sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
dslashtime +=usecond();
|
|
||||||
alltime+=usecond();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField & U,
|
|
||||||
const FermionField &in, FermionField &out,int dag)
|
|
||||||
{
|
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
|
||||||
alltime-=usecond();
|
|
||||||
Compressor compressor(dag);
|
|
||||||
|
|
||||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
|
||||||
|
|
||||||
int threads = GridThread::GetThreads();
|
|
||||||
int HT = GridThread::GetHyperThreads();
|
|
||||||
int cores = GridThread::GetCores();
|
|
||||||
int nwork = U._grid->oSites();
|
|
||||||
|
|
||||||
commtime -=usecond();
|
|
||||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
|
||||||
st.HaloExchangeComplete(handle);
|
|
||||||
commtime +=usecond();
|
|
||||||
|
|
||||||
jointime -=usecond();
|
|
||||||
jointime +=usecond();
|
|
||||||
|
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
|
||||||
// Not loop ordering and data layout.
|
|
||||||
// Designed to create
|
|
||||||
// - per thread reuse in L1 cache for U
|
|
||||||
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
|
||||||
|
|
||||||
#pragma omp parallel
|
|
||||||
{
|
|
||||||
for(int jjj=0;jjj<100;jjj++){
|
|
||||||
#pragma omp barrier
|
|
||||||
dslashtime -=usecond();
|
|
||||||
if ( dag == DaggerYes ) {
|
|
||||||
if( this->HandOptDslash ) {
|
|
||||||
#pragma omp for
|
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
|
||||||
int sU=0;
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
int sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
|
|
||||||
#pragma omp for
|
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
|
||||||
{
|
|
||||||
int sd;
|
|
||||||
for(sd=0;sd<Ls;sd++){
|
|
||||||
int sU=0;
|
|
||||||
int sF = sd+Ls*sU;
|
|
||||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if( this->AsmOptDslash ) {
|
|
||||||
// for(int i=0;i<1;i++){
|
|
||||||
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
|
||||||
// PerformanceCounter Counter(i);
|
|
||||||
// Counter.Start();
|
|
||||||
|
|
||||||
#pragma omp for
|
|
||||||
for(int t=0;t<threads;t++){
|
|
||||||
|
|
||||||
int hyperthread = t%HT;
|
|
||||||
int core = t/HT;
|
|
||||||
|
|
||||||
int sswork, swork,soff,ssoff, sU,sF;
|
|
||||||
|
|
||||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
|
||||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
|
||||||
|
|
||||||
for(int ss=0;ss<sswork;ss++){
|
|
||||||
for(int s=soff;s<soff+swork;s++){
|
|
||||||
|
|
||||||
sU=0;
|
|
||||||
sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Counter.Stop();
|
|
||||||
// Counter.Report();
|
|
||||||
// }
|
|
||||||
} else if( this->HandOptDslash ) {
|
|
||||||
#pragma omp for
|
|
||||||
|
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
|
||||||
int sU=0;
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
int sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
#pragma omp for
|
|
||||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
|
||||||
int sU=0;
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
int sF = s+Ls*sU;
|
|
||||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
dslashtime +=usecond();
|
|
||||||
alltime+=usecond();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField & U,
|
|
||||||
const FermionField &in, FermionField &out,int dag)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
|
||||||
@ -706,6 +470,8 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
|
|||||||
|
|
||||||
FermOpTemplateInstantiate(WilsonFermion5D);
|
FermOpTemplateInstantiate(WilsonFermion5D);
|
||||||
GparityFermOpTemplateInstantiate(WilsonFermion5D);
|
GparityFermOpTemplateInstantiate(WilsonFermion5D);
|
||||||
|
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
|
||||||
|
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
@ -87,6 +87,7 @@ namespace Grid {
|
|||||||
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
|
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||||
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
|
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
|
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
|
||||||
|
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||||
|
|
||||||
// These can be overridden by fancy 5d chiral action
|
// These can be overridden by fancy 5d chiral action
|
||||||
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
||||||
@ -121,32 +122,12 @@ namespace Grid {
|
|||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalOMPbench(StencilImpl & st,
|
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in,
|
|
||||||
FermionField &out,
|
|
||||||
int dag);
|
|
||||||
|
|
||||||
void DhopInternalL1bench(StencilImpl & st,
|
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in,
|
|
||||||
FermionField &out,
|
|
||||||
int dag);
|
|
||||||
|
|
||||||
void DhopInternalCommsThenCompute(StencilImpl & st,
|
void DhopInternalCommsThenCompute(StencilImpl & st,
|
||||||
LebesgueOrder &lo,
|
LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
void DhopInternalCommsOverlapCompute(StencilImpl & st,
|
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in,
|
|
||||||
FermionField &out,
|
|
||||||
int dag);
|
|
||||||
|
|
||||||
// Constructors
|
// Constructors
|
||||||
WilsonFermion5D(GaugeField &_Umu,
|
WilsonFermion5D(GaugeField &_Umu,
|
||||||
@ -156,6 +137,15 @@ namespace Grid {
|
|||||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
double _M5,const ImplParams &p= ImplParams());
|
double _M5,const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
|
// Constructors
|
||||||
|
WilsonFermion5D(int simd,
|
||||||
|
GaugeField &_Umu,
|
||||||
|
GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
double _M5,const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
// DoubleStore
|
// DoubleStore
|
||||||
void ImportGauge(const GaugeField &_Umu);
|
void ImportGauge(const GaugeField &_Umu);
|
||||||
|
|
||||||
|
@ -529,5 +529,7 @@ void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
FermOpTemplateInstantiate(WilsonKernels);
|
FermOpTemplateInstantiate(WilsonKernels);
|
||||||
|
template class WilsonKernels<DomainWallRedBlack5dImplF>;
|
||||||
|
template class WilsonKernels<DomainWallRedBlack5dImplD>;
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
@ -256,5 +256,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
|||||||
template class WilsonKernels<WilsonImplD>;
|
template class WilsonKernels<WilsonImplD>;
|
||||||
template class WilsonKernels<GparityWilsonImplF>;
|
template class WilsonKernels<GparityWilsonImplF>;
|
||||||
template class WilsonKernels<GparityWilsonImplD>;
|
template class WilsonKernels<GparityWilsonImplD>;
|
||||||
|
template class WilsonKernels<DomainWallRedBlack5dImplF>;
|
||||||
|
template class WilsonKernels<DomainWallRedBlack5dImplD>;
|
||||||
}}
|
}}
|
||||||
#endif
|
#endif
|
||||||
|
@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
Chi_11 = ref()(1)(1);\
|
Chi_11 = ref()(1)(1);\
|
||||||
Chi_12 = ref()(1)(2);
|
Chi_12 = ref()(1)(2);
|
||||||
|
|
||||||
|
// To splat or not to splat depends on the implementation
|
||||||
#define MULT_2SPIN(A)\
|
#define MULT_2SPIN(A)\
|
||||||
auto & ref(U._odata[sU](A)); \
|
auto & ref(U._odata[sU](A)); \
|
||||||
U_00 = ref()(0,0);\
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
U_10 = ref()(1,0);\
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
U_20 = ref()(2,0);\
|
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||||
U_01 = ref()(0,1);\
|
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||||
U_11 = ref()(1,1); \
|
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||||
U_21 = ref()(2,1);\
|
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||||
UChi_00 = U_00*Chi_00;\
|
UChi_00 = U_00*Chi_00;\
|
||||||
UChi_10 = U_00*Chi_10;\
|
UChi_10 = U_00*Chi_10;\
|
||||||
UChi_01 = U_10*Chi_00;\
|
UChi_01 = U_10*Chi_00;\
|
||||||
@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi_11+= U_11*Chi_11;\
|
UChi_11+= U_11*Chi_11;\
|
||||||
UChi_02+= U_21*Chi_01;\
|
UChi_02+= U_21*Chi_01;\
|
||||||
UChi_12+= U_21*Chi_11;\
|
UChi_12+= U_21*Chi_11;\
|
||||||
U_00 = ref()(0,2);\
|
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||||
U_10 = ref()(1,2);\
|
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||||
U_20 = ref()(2,2);\
|
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||||
UChi_00+= U_00*Chi_02;\
|
UChi_00+= U_00*Chi_02;\
|
||||||
UChi_10+= U_00*Chi_12;\
|
UChi_10+= U_00*Chi_12;\
|
||||||
UChi_01+= U_10*Chi_02;\
|
UChi_01+= U_10*Chi_02;\
|
||||||
@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi_02+= U_20*Chi_02;\
|
UChi_02+= U_20*Chi_02;\
|
||||||
UChi_12+= U_20*Chi_12;
|
UChi_12+= U_20*Chi_12;
|
||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_00,Chi_00);\
|
permute##dir(Chi_00,Chi_00);\
|
||||||
permute##dir(Chi_01,Chi_01);\
|
permute##dir(Chi_01,Chi_01);\
|
||||||
@ -809,7 +811,6 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
|
|||||||
int sF,int sU,const FermionField &in, FermionField &out)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
||||||
//check consistency of return types between these functions and the ones in WilsonKernels.cc
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -843,6 +844,47 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//////////////
|
||||||
|
/*
|
||||||
|
template<>
|
||||||
|
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
@ -870,4 +912,21 @@ template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilI
|
|||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
Loading…
Reference in New Issue
Block a user