1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 20:57:06 +01:00

Large change with KNL preparation

This commit is contained in:
paboyle
2016-06-03 03:24:26 -07:00
parent 1c0e922585
commit 139cc5f1ae
26 changed files with 1810 additions and 1705 deletions

View File

@ -288,40 +288,20 @@ PARALLEL_FOR_LOOP
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopInternalCommsThenCompute(st,U,in,out,dag);
}
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
}
};

View File

@ -114,9 +114,6 @@ namespace Grid {
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
// Constructor
WilsonFermion(GaugeField &_Umu,
GridCartesian &Fgrid,

View File

@ -1,5 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -39,8 +38,6 @@ namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;
// 5d lattice for DWF.
template<class Impl>
@ -300,6 +297,8 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
return;
#if 0
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
@ -320,6 +319,7 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
#endif
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
@ -342,25 +342,14 @@ template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int LLs = in._grid->_rdimensions[0];
commtime -=usecond();
// auto handle = st.HaloExchangeBegin(in,compressor);
// st.HaloExchangeComplete(handle);
st.HaloExchange(in,compressor);
commtime +=usecond();
@ -368,59 +357,24 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
jointime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Not loop ordering and data layout.
// Designed to create
// - per thread reuse in L1 cache for U
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF=s+LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
if( this->AsmOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
#pragma omp parallel for schedule(runtime)
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=lo.Reorder(ss);
int sF=LLs*sU;
for(int s=0;s<LLs;s++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
sF++;
}
}
}
@ -473,7 +427,7 @@ FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
}}

View File

@ -49,8 +49,6 @@ namespace Grid {
class WilsonFermion5DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOptDslash; // these are a temporary hack
static int HandOptDslash; // these are a temporary hack
static const std::vector<int> directions;
static const std::vector<int> displacements;
const int npoint = 8;
@ -122,13 +120,6 @@ namespace Grid {
FermionField &out,
int dag);
void DhopInternalCommsThenCompute(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -31,14 +31,43 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
namespace QCD {
int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;
template<class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,in,out);
else if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
// No asm implementation yet.
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
// else
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
}
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
@ -214,9 +243,9 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
@ -518,17 +547,9 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
vstream(out._odata[sF],result);
}
#if ( ! defined(AVX512) )
template<class Impl>
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
#endif
FermOpTemplateInstantiate(WilsonKernels);
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;

View File

@ -38,14 +38,21 @@ namespace Grid {
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOpt; // these are a temporary hack
static int HandOpt; // these are a temporary hack
};
template<class Impl> class WilsonKernels : public FermionOperator<Impl> {
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
@ -58,15 +65,26 @@ namespace Grid {
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
private:
// Specialised variants
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);

View File

@ -2,6 +2,8 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
Copyright (C) 2015
@ -26,237 +28,75 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#if defined(AVX512)
//#if defined (IMCI)
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
namespace Grid {
namespace QCD {
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
uint64_t now;
uint64_t first ;
int offset,local,perm, ptype;
const SiteHalfSpinor *pbuf = & buf[0];
const SiteSpinor *plocal = & in._odata[0];
void *pf;
int osites = in._grid->oSites();
StencilEntry *SE;
//#define STAMP(i) timers[i] = cyclecount() ;
#define STAMP(i) //timers[i] = cyclecount() ;
MASK_REGS;
first = cyclecount();
SE=st.GetEntry(ptype,Xm,ss);
// Xm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Ym,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXM(Xm,pf);
}
XP_RECON;
// Ym
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYM(Ym,pf);
}
YP_RECON_ACCUM;
// Zm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Tm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZM(Zm,pf);
}
ZP_RECON_ACCUM;
// Tm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
SE=st.GetEntry(ptype,Tp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTM(Tm,pf);
}
TP_RECON_ACCUM;
// Tp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTP(Tp,pf);
}
TM_RECON_ACCUM;
// Zp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Yp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZP(Zp,pf);
}
ZM_RECON_ACCUM;
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Xp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYP(Yp,pf);
}
YM_RECON_ACCUM;
// Xp
perm = SE->_permute;
offset = SE->_offset;
local = SE->_is_local;
// Prefetch
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXP(Xp,pf);
}
XM_RECON_ACCUM;
debug:
SAVE_RESULT(&out._odata[ss]);
assert(0);
}
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
template class WilsonKernels<GparityWilsonImplF>;
template class WilsonKernels<GparityWilsonImplD>;
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
}}
#if defined(AVX512)
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
static Vector<vComplexF> signs;
int setupSigns(void ){
Vector<vComplexF> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
return 1;
}
static int signInit = setupSigns();
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr)
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef VMOVIDUP
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#define MAYBEPERM(A,B)
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
template<>
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#endif
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
template class WilsonKernels<GparityWilsonImplF>;
template class WilsonKernels<GparityWilsonImplD>;
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
}}

View File

@ -0,0 +1,154 @@
{
int locala,perma, ptypea;
int localb,permb, ptypeb;
uint64_t basea, baseb;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
vComplexF *isigns = &signs[0];
MASK_REGS;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns);
XM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXP(Xp,baseb);
}
LOAD64(%r10,isigns);
XM_RECON;
////////////////////////////////
// Yp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
////////////////////////////////
// Zp
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZP(Zp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
////////////////////////////////
// Tp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTP(Tp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
////////////////////////////////
// Xm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXM(Xm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
////////////////////////////////
// Ym
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_RECON_ACCUM;
////////////////////////////////
// Zm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZM(Zm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_RECON_ACCUM;
////////////////////////////////
// Tm
////////////////////////////////
basea = (uint64_t)&out._odata[ss];
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTM(Tm,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
SAVE_RESULT(&out._odata[ss]);
}

View File

@ -312,7 +312,7 @@ namespace QCD {
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@ -552,12 +552,10 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
return 0;
}
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@ -798,7 +796,6 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
return 0;
}
@ -806,125 +803,80 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
// Specialise Gparity to simple implementation
////////////////////////////////////////////////
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
//////////////
/*
template<>
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
return 0;
}
////////////// Wilson ; uses this implementation /////////////////////
// Need Nc=3 though //
template<>
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
*/
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);