mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-03 18:55:56 +01:00
Overlap comms compute improvements in hand op kernels, and better timing from Edison and Cori
This commit is contained in:
parent
d19321dfde
commit
dafc74020c
@ -137,9 +137,6 @@
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#undef PACKAGE_TARNAME
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#undef PACKAGE_URL
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#undef PACKAGE_VERSION
|
||||
|
||||
|
@ -111,6 +111,7 @@ namespace Grid {
|
||||
}
|
||||
|
||||
void Communicate(void ) {
|
||||
commtime-=usecond();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
_grid->SendToRecvFrom(Packets[i].send_buf,
|
||||
Packets[i].to_rank,
|
||||
@ -118,6 +119,7 @@ namespace Grid {
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes);
|
||||
}
|
||||
commtime+=usecond();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////
|
||||
@ -181,12 +183,10 @@ PARALLEL_FOR_LOOP
|
||||
/////////////////////////////////////////
|
||||
#define TIMING_HACK
|
||||
#ifdef TIMING_HACK
|
||||
double buftime;
|
||||
double jointime;
|
||||
double gathertime;
|
||||
double commtime;
|
||||
double commstime;
|
||||
double halotime;
|
||||
double scattertime;
|
||||
double halogtime;
|
||||
double mergetime;
|
||||
double gathermtime;
|
||||
double splicetime;
|
||||
@ -202,13 +202,11 @@ PARALLEL_FOR_LOOP
|
||||
{
|
||||
#ifdef TIMING_HACK
|
||||
gathertime=0;
|
||||
jointime=0;
|
||||
commtime=0;
|
||||
commstime=0;
|
||||
halotime=0;
|
||||
scattertime=0;
|
||||
halogtime=0;
|
||||
mergetime=0;
|
||||
gathermtime=0;
|
||||
buftime=0;
|
||||
splicetime=0;
|
||||
nosplicetime=0;
|
||||
#endif
|
||||
@ -514,7 +512,9 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
// Completes a halo exchange whose work was handed to the thread `thr`:
// blocks until that thread has finished and then calls CommsMerge().
// The time spent blocked in thr.join() is added to the `jointime` counter as
// the difference of two usecond() readings (presumably microseconds, going by
// the helper's name -- confirm against its definition).
// NOTE(review): what CommsMerge() does is not visible in this excerpt.
void HaloExchangeComplete(std::thread &thr)
|
||||
{
|
||||
jointime-=usecond();
|
||||
thr.join();
|
||||
jointime+=usecond();
|
||||
CommsMerge();
|
||||
}
|
||||
|
||||
@ -522,7 +522,7 @@ PARALLEL_FOR_LOOP
|
||||
{
|
||||
// conformable(source._grid,_grid);
|
||||
assert(source._grid==_grid);
|
||||
halotime-=usecond();
|
||||
halogtime-=usecond();
|
||||
|
||||
assert (comm_buf.size() == _unified_buffer_size );
|
||||
u_comm_offset=0;
|
||||
@ -582,7 +582,7 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
|
||||
assert(u_comm_offset==_unified_buffer_size);
|
||||
halotime+=usecond();
|
||||
halogtime+=usecond();
|
||||
}
|
||||
|
||||
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
|
||||
@ -633,7 +633,6 @@ PARALLEL_FOR_LOOP
|
||||
assert (recv_from_rank != _grid->ThisRank());
|
||||
|
||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||
commtime-=usecond();
|
||||
/*
|
||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
@ -647,8 +646,6 @@ PARALLEL_FOR_LOOP
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
commtime+=usecond();
|
||||
|
||||
u_comm_offset+=words;
|
||||
}
|
||||
}
|
||||
@ -657,7 +654,6 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
|
||||
{
|
||||
buftime-=usecond();
|
||||
const int Nsimd = _grid->Nsimd();
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
@ -687,8 +683,6 @@ PARALLEL_FOR_LOOP
|
||||
std::vector<scalar_object *> rpointers(Nsimd);
|
||||
std::vector<scalar_object *> spointers(Nsimd);
|
||||
|
||||
buftime+=usecond();
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
@ -745,9 +739,7 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
commstime-=usecond();
|
||||
AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes);
|
||||
commstime+=usecond();
|
||||
|
||||
rpointers[i] = rp;
|
||||
|
||||
|
@ -99,6 +99,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
|
||||
// Allocate the required comms buffer
|
||||
ImportGauge(_Umu);
|
||||
alltime=0;
|
||||
commtime=0;
|
||||
jointime=0;
|
||||
dslashtime=0;
|
||||
@ -234,23 +235,23 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::Report(void)
|
||||
{
|
||||
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "join time "<<jointime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
}
|
||||
template<class Impl>
|
||||
@ -288,7 +289,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
|
||||
alltime-=usecond();
|
||||
Compressor compressor(dag);
|
||||
|
||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||
@ -413,6 +414,7 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
dslashtime +=usecond();
|
||||
alltime+=usecond();
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -421,6 +423,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
alltime-=usecond();
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
@ -541,6 +544,7 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
dslash1time +=usecond();
|
||||
alltime+=usecond();
|
||||
|
||||
}
|
||||
|
||||
|
@ -61,6 +61,7 @@ namespace Grid {
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
typedef WilsonKernels<Impl> Kernels;
|
||||
double alltime;
|
||||
double jointime;
|
||||
double commtime;
|
||||
double dslashtime;
|
||||
|
@ -318,21 +318,21 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
REGISTER Simd result_00; // 12 regs on knc
|
||||
REGISTER Simd result_01;
|
||||
REGISTER Simd result_02;
|
||||
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
||||
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
||||
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_10;
|
||||
REGISTER Simd result_11;
|
||||
REGISTER Simd result_12;
|
||||
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
||||
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
||||
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_20;
|
||||
REGISTER Simd result_21;
|
||||
REGISTER Simd result_22;
|
||||
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
||||
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
||||
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_30;
|
||||
REGISTER Simd result_31;
|
||||
REGISTER Simd result_32; // 20 left
|
||||
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
||||
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
||||
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
||||
|
||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
||||
REGISTER Simd Chi_01;
|
||||
@ -372,172 +372,178 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
||||
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset,local,perm, ptype;
|
||||
|
||||
int offset, ptype;
|
||||
int num = 0;
|
||||
|
||||
// Xp
|
||||
SE=st.GetEntry(ptype,Xp,ss);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
XP_PROJ;
|
||||
if ( perm) {
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
|
||||
{
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Xp);
|
||||
XP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
XP_RECON;
|
||||
|
||||
|
||||
// Yp
|
||||
SE=st.GetEntry(ptype,Yp,ss);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
YP_PROJ;
|
||||
if ( perm) {
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
{
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Yp);
|
||||
YP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
YP_RECON_ACCUM;
|
||||
|
||||
|
||||
// Zp
|
||||
SE=st.GetEntry(ptype,Zp,ss);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
ZP_PROJ;
|
||||
if ( perm) {
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
{
|
||||
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Zp);
|
||||
ZP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
ZP_RECON_ACCUM;
|
||||
|
||||
// Tp
|
||||
SE=st.GetEntry(ptype,Tp,ss);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
TP_PROJ;
|
||||
if ( perm) {
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
{
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Tp);
|
||||
TP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
TP_RECON_ACCUM;
|
||||
|
||||
// Xm
|
||||
SE=st.GetEntry(ptype,Xm,ss);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
XM_PROJ;
|
||||
if ( perm) {
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
{
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Xm);
|
||||
XM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
XM_RECON_ACCUM;
|
||||
|
||||
// Ym
|
||||
SE=st.GetEntry(ptype,Ym,ss);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
YM_PROJ;
|
||||
if ( perm) {
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
{
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Ym);
|
||||
YM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
YM_RECON_ACCUM;
|
||||
|
||||
// Zm
|
||||
SE=st.GetEntry(ptype,Zm,ss);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
ZM_PROJ;
|
||||
if ( perm) {
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
{
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Zm);
|
||||
ZM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
ZM_RECON_ACCUM;
|
||||
|
||||
// Tm
|
||||
SE=st.GetEntry(ptype,Tm,ss);
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
perm = SE->_permute;
|
||||
|
||||
if ( local ) {
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
TM_PROJ;
|
||||
if ( perm) {
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
} else {
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
{
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Tm);
|
||||
TM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
TM_RECON_ACCUM;
|
||||
|
||||
{
|
||||
SiteSpinor & ref (out._odata[ss]);
|
||||
SiteSpinor & ref (out._odata[ss]);
|
||||
if ( Local ) {
|
||||
vstream(ref()(0)(0),result_00*(-0.5));
|
||||
vstream(ref()(0)(1),result_01*(-0.5));
|
||||
vstream(ref()(0)(2),result_02*(-0.5));
|
||||
@ -550,9 +556,289 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
||||
vstream(ref()(3)(0),result_30*(-0.5));
|
||||
vstream(ref()(3)(1),result_31*(-0.5));
|
||||
vstream(ref()(3)(2),result_32*(-0.5));
|
||||
} else if ( num ) {
|
||||
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
||||
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
||||
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
|
||||
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
|
||||
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
|
||||
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
|
||||
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
|
||||
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
|
||||
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
|
||||
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
||||
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
||||
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Hand-unrolled single-site hopping-term kernel (non-dagger variant).
//
// For each of the eight stencil legs (Xp, Yp, Zp, Tp, Xm, Ym, Zm, Tm) it
// fetches the stencil entry for site `ss` and, if the leg is selected by the
// Local / Nonlocal flags, contributes to the twelve accumulators
// result_00 .. result_32:
//   * Local    && SE->_is_local : LOAD_CHIMU, spin-project, optional permute
//   * Nonlocal && !SE->_is_local: LOAD_CHI
// followed in both cases by MULT_2SPIN(dir) and the matching *_RECON_ACCUM.
// `num` counts how many legs actually contributed.
//
// On exit out._odata[ss] is
//   * overwritten with result * (-0.5)              when Local is true,
//   * incremented by  result * (-0.5)               when Local is false and num != 0,
//   * left untouched                                otherwise.
//
// In this variant the forward legs (Xp..Tp) use the *M_PROJ / *M_RECON_ACCUM
// macros and the backward legs (Xm..Tm) use the *P_ ones.
//
// NOTE(review): U, buf, in, sU and the local `offset` are not referenced
// directly in the visible text; presumably the LOAD_*/MULT_2SPIN macros
// (defined outside this excerpt) consume them -- confirm against the macro
// definitions before changing any of these names.
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
{
|
||||
// std::cout << "Hand op Dhop "<<std::endl;
|
||||
typedef typename Simd::scalar_type S;
|
||||
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
// Accumulators are zeroed up front because every leg below only accumulates.
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
||||
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
||||
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
||||
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
||||
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
||||
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
||||
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
||||
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
||||
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
||||
|
||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
||||
REGISTER Simd Chi_01;
|
||||
REGISTER Simd Chi_02;
|
||||
|
||||
REGISTER Simd Chi_10;
|
||||
REGISTER Simd Chi_11;
|
||||
REGISTER Simd Chi_12; // 14 left
|
||||
|
||||
REGISTER Simd UChi_00; // two spinor; 6 regs
|
||||
REGISTER Simd UChi_01;
|
||||
REGISTER Simd UChi_02;
|
||||
|
||||
REGISTER Simd UChi_10;
|
||||
REGISTER Simd UChi_11;
|
||||
REGISTER Simd UChi_12; // 8 left
|
||||
|
||||
REGISTER Simd U_00; // two rows of U matrix
|
||||
REGISTER Simd U_10;
|
||||
REGISTER Simd U_20;
|
||||
REGISTER Simd U_01;
|
||||
REGISTER Simd U_11;
|
||||
REGISTER Simd U_21; // 2 reg left.
|
||||
|
||||
// The full-spinor names Chimu_* are aliases onto the Chi_* / UChi_* variables
// declared above, so no additional variables are declared for them.
#define Chimu_00 Chi_00
|
||||
#define Chimu_01 Chi_01
|
||||
#define Chimu_02 Chi_02
|
||||
#define Chimu_10 Chi_10
|
||||
#define Chimu_11 Chi_11
|
||||
#define Chimu_12 Chi_12
|
||||
#define Chimu_20 UChi_00
|
||||
#define Chimu_21 UChi_01
|
||||
#define Chimu_22 UChi_02
|
||||
#define Chimu_30 UChi_10
|
||||
#define Chimu_31 UChi_11
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset, ptype;
|
||||
int num = 0;
|
||||
|
||||
// Xp
|
||||
SE=st.GetEntry(ptype,Xp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
XM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Xp);
|
||||
XM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
|
||||
// Yp
|
||||
SE=st.GetEntry(ptype,Yp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
YM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Yp);
|
||||
YM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
|
||||
// Zp
|
||||
SE=st.GetEntry(ptype,Zp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
ZM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Zp);
|
||||
ZM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Tp
|
||||
SE=st.GetEntry(ptype,Tp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
TM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Tp);
|
||||
TM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Xm
|
||||
SE=st.GetEntry(ptype,Xm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
XP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Xm);
|
||||
XP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Ym
|
||||
SE=st.GetEntry(ptype,Ym,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
YP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Ym);
|
||||
YP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Zm
|
||||
SE=st.GetEntry(ptype,Zm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
ZP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Zm);
|
||||
ZP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Tm
|
||||
SE=st.GetEntry(ptype,Tm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
TP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Tm);
|
||||
TP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Write-back: a Local call overwrites the output site; a non-Local call adds
// onto what is already stored there, and only if at least one leg contributed.
SiteSpinor & ref (out._odata[ss]);
|
||||
if ( Local ) {
|
||||
vstream(ref()(0)(0),result_00*(-0.5));
|
||||
vstream(ref()(0)(1),result_01*(-0.5));
|
||||
vstream(ref()(0)(2),result_02*(-0.5));
|
||||
vstream(ref()(1)(0),result_10*(-0.5));
|
||||
vstream(ref()(1)(1),result_11*(-0.5));
|
||||
vstream(ref()(1)(2),result_12*(-0.5));
|
||||
vstream(ref()(2)(0),result_20*(-0.5));
|
||||
vstream(ref()(2)(1),result_21*(-0.5));
|
||||
vstream(ref()(2)(2),result_22*(-0.5));
|
||||
vstream(ref()(3)(0),result_30*(-0.5));
|
||||
vstream(ref()(3)(1),result_31*(-0.5));
|
||||
vstream(ref()(3)(2),result_32*(-0.5));
|
||||
} else if ( num ) {
|
||||
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
||||
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
||||
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
|
||||
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
|
||||
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
|
||||
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
|
||||
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
|
||||
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
|
||||
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
|
||||
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
||||
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
||||
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
@ -795,7 +1081,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
||||
vstream(ref()(3)(2),result_32*(-0.5));
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
////////////////////////////////////////////////
|
||||
// Specialise Gparity to simple implementation
|
||||
////////////////////////////////////////////////
|
||||
|
Loading…
x
Reference in New Issue
Block a user