1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00

configure

This commit is contained in:
paboyle 2016-03-30 00:17:43 -07:00
commit 2ded354403
13 changed files with 539 additions and 1215 deletions

View File

@ -62,6 +62,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <serialisation/Serialisation.h>
#include <Config.h>
#include <Timer.h>
#include <PerfCount.h>
#include <Log.h>
#include <AlignedAllocator.h>
#include <Simd.h>

View File

@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <ctime>
#include <chrono>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#ifdef __linux__
@ -163,8 +163,8 @@ public:
{
#ifdef __linux__
if ( fd!= -1) {
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
}
begin =cyclecount();
#else
@ -176,7 +176,7 @@ public:
count=0;
#ifdef __linux__
if ( fd!= -1) {
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::read(fd, &count, sizeof(long long));
}
elapsed = cyclecount() - begin;
@ -187,16 +187,16 @@ public:
}
void Report(void) {
#ifdef __linux__
printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
#else
printf("%llu cycles \n", elapsed );
std::printf("%llu cycles \n", elapsed );
#endif
}
~PerformanceCounter()
{
#ifdef __linux__
close(fd);
::close(fd);
#endif
}

View File

@ -42,6 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
#define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)

View File

@ -335,69 +335,7 @@ PARALLEL_FOR_LOOP
void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
auto handle = st.HaloExchangeBegin(in,compressor);
bool local = true;
bool nonlocal = false;
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
}
st.HaloExchangeComplete(handle);
local = false;
nonlocal = true;
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
}
}
}
assert(0);
};

View File

@ -281,11 +281,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
// if ( Impl::overlapCommsCompute () ) {
// DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
// } else {
DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
// }
}
template<class Impl>
@ -368,7 +364,7 @@ PARALLEL_FOR_LOOP
sU = lo.Reorder(sU);
}
sF = s+Ls*sU;
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
@ -428,130 +424,6 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
const FermionField &in, FermionField &out,int dag)
{
assert(0);
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int threads = GridThread::GetThreads();
int HT = GridThread::GetHyperThreads();
int cores = GridThread::GetCores();
int nwork = U._grid->oSites();
commtime -=usecond();
auto handle = st.HaloExchangeBegin(in,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Not loop ordering and data layout.
// Designed to create
// - per thread reuse in L1 cache for U
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
bool local = true;
bool nonlocal = false;
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
{
int sd;
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
} else {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
dslashtime +=usecond();
jointime -=usecond();
st.HaloExchangeComplete(handle);
jointime +=usecond();
local = false;
nonlocal = true;
dslash1time -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
{
int sd;
for(sd=0;sd<Ls;sd++){
int sU=ss;
int sF = sd+Ls*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
} else {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss;
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
}
}
}
}
dslash1time +=usecond();
alltime+=usecond();
}
template<class Impl>

View File

@ -38,216 +38,177 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
int num = 0;
result=zero;
///////////////////////////
// Xp
///////////////////////////
SE=st.GetEntry(ptype,Xp,sF);
if (local && SE->_is_local ) {
if (SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjXp(chi,in._odata[SE->_offset]);
}
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
} else {
chi_p=&buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
accumReconXp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
spReconXp(result,Uchi);
///////////////////////////
// Yp
///////////////////////////
SE=st.GetEntry(ptype,Yp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjYp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
accumReconYp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
accumReconYp(result,Uchi);
///////////////////////////
// Zp
///////////////////////////
SE=st.GetEntry(ptype,Zp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjZp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
accumReconZp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
accumReconZp(result,Uchi);
///////////////////////////
// Tp
///////////////////////////
SE=st.GetEntry(ptype,Tp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
accumReconTp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
accumReconTp(result,Uchi);
///////////////////////////
// Xm
///////////////////////////
SE=st.GetEntry(ptype,Xm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjXm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
accumReconXm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
accumReconXm(result,Uchi);
///////////////////////////
// Ym
///////////////////////////
SE=st.GetEntry(ptype,Ym,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjYm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
accumReconYm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
accumReconYm(result,Uchi);
///////////////////////////
// Zm
///////////////////////////
SE=st.GetEntry(ptype,Zm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjZm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
accumReconZm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
accumReconZm(result,Uchi);
///////////////////////////
// Tm
///////////////////////////
SE=st.GetEntry(ptype,Tm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
accumReconTm(result,Uchi);
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
accumReconTm(result,Uchi);
num++;
}
if ( local ) {
vstream(out._odata[sF],result);
} else if ( num ) {
vstream(out._odata[sF],out._odata[sF]+result);
}
vstream(out._odata[sF],result);
};
@ -255,216 +216,177 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
int num = 0;
result=zero;
///////////////////////////
// Xp
///////////////////////////
SE=st.GetEntry(ptype,Xm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjXp(chi,in._odata[SE->_offset]);
}
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
} else {
chi_p=&buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
accumReconXp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
spReconXp(result,Uchi);
///////////////////////////
// Yp
///////////////////////////
SE=st.GetEntry(ptype,Ym,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjYp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
accumReconYp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
accumReconYp(result,Uchi);
///////////////////////////
// Zp
///////////////////////////
SE=st.GetEntry(ptype,Zm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjZp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
accumReconZp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
accumReconZp(result,Uchi);
///////////////////////////
// Tp
///////////////////////////
SE=st.GetEntry(ptype,Tm,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTp(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTp(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
accumReconTp(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
accumReconTp(result,Uchi);
///////////////////////////
// Xm
///////////////////////////
SE=st.GetEntry(ptype,Xp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjXm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjXm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
accumReconXm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
accumReconXm(result,Uchi);
///////////////////////////
// Ym
///////////////////////////
SE=st.GetEntry(ptype,Yp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjYm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjYm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
accumReconYm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
accumReconYm(result,Uchi);
///////////////////////////
// Zm
///////////////////////////
SE=st.GetEntry(ptype,Zp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjZm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjZm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
accumReconZm(result,Uchi);
num++;
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
accumReconZm(result,Uchi);
///////////////////////////
// Tm
///////////////////////////
SE=st.GetEntry(ptype,Tp,sF);
if (local && SE->_is_local ) {
if ( SE->_is_local ) {
chi_p = &chi;
if ( SE->_permute ) {
spProjTm(tmp,in._odata[SE->_offset]);
permute(chi,tmp,ptype);
} else {
spProjTm(chi,in._odata[SE->_offset]);
}
} else {
chi_p=&buf[SE->_offset];
}
if ( nonlocal && (!SE->_is_local) ) {
chi=buf[SE->_offset];
}
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
accumReconTm(result,Uchi);
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
accumReconTm(result,Uchi);
num++;
}
if ( local ) {
vstream(out._odata[sF],result);
} else if ( num ) {
vstream(out._odata[sF],out._odata[sF]+result);
}
vstream(out._odata[sF],result);
};
template<class Impl>
@ -596,11 +518,11 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
vstream(out._odata[sF],result);
}
#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
#if ( ! defined(IMCI) && ! defined(AVX512) )
template<class Impl>
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}

View File

@ -48,11 +48,11 @@ namespace Grid {
public:
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
@ -60,15 +60,15 @@ namespace Grid {
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
int sF,int sU,const FermionField &in, FermionField &out);
WilsonKernels(const ImplParams &p= ImplParams());

View File

@ -28,6 +28,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid.h>
#if defined(AVX512) || defined (IMCI)
//#if defined (IMCI)
#include <simd/Avx512Asm.h>
@ -105,7 +106,7 @@ namespace QCD {
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
int ss,int sU,const FermionField &in, FermionField &out)
{
uint64_t now;
uint64_t first ;
@ -158,7 +159,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XM_PROJMEM(&plocal[offset]);
XP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
@ -168,7 +169,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
{
MULT_2SPIN_DIR_PFXM(Xm,pf);
}
XM_RECON;
XP_RECON;
// Ym
offset = SE->_offset;
@ -181,7 +182,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YM_PROJMEM(&plocal[offset]);
YP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
@ -191,7 +192,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
{
MULT_2SPIN_DIR_PFYM(Ym,pf);
}
YM_RECON_ACCUM;
YP_RECON_ACCUM;
// Zm
offset = SE->_offset;
@ -204,7 +205,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZM_PROJMEM(&plocal[offset]);
ZP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
@ -214,7 +215,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
{
MULT_2SPIN_DIR_PFZM(Zm,pf);
}
ZM_RECON_ACCUM;
ZP_RECON_ACCUM;
// Tm
offset = SE->_offset;
@ -227,7 +228,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
if ( local ) {
TM_PROJMEM(&plocal[offset]);
TP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
@ -237,7 +238,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
{
MULT_2SPIN_DIR_PFTM(Tm,pf);
}
TM_RECON_ACCUM;
TP_RECON_ACCUM;
// Tp
offset = SE->_offset;
@ -250,7 +251,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TP_PROJMEM(&plocal[offset]);
TM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
@ -260,7 +261,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
{
MULT_2SPIN_DIR_PFTP(Tp,pf);
}
TP_RECON_ACCUM;
TM_RECON_ACCUM;
// Zp
offset = SE->_offset;
@ -273,7 +274,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZP_PROJMEM(&plocal[offset]);
ZM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
@ -283,7 +284,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
{
MULT_2SPIN_DIR_PFZP(Zp,pf);
}
ZP_RECON_ACCUM;
ZM_RECON_ACCUM;
offset = SE->_offset;
@ -296,7 +297,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YP_PROJMEM(&plocal[offset]);
YM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
@ -306,7 +307,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
{
MULT_2SPIN_DIR_PFYP(Yp,pf);
}
YP_RECON_ACCUM;
YM_RECON_ACCUM;
// Xp
perm = SE->_permute;
@ -321,7 +322,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XP_PROJMEM(&plocal[offset]);
XM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
@ -331,7 +332,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
{
MULT_2SPIN_DIR_PFXP(Xp,pf);
}
XP_RECON_ACCUM;
XM_RECON_ACCUM;
debug:
SAVE_RESULT(&out._odata[ss]);
@ -340,6 +341,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
template class WilsonKernels<GparityWilsonImplF>;
template class WilsonKernels<GparityWilsonImplD>;
}}
#endif

View File

@ -308,548 +308,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
namespace QCD {
#if 0
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
REGISTER Simd Chi_00; // two spinor; 6 regs
REGISTER Simd Chi_01;
REGISTER Simd Chi_02;
REGISTER Simd Chi_10;
REGISTER Simd Chi_11;
REGISTER Simd Chi_12; // 14 left
REGISTER Simd UChi_00; // two spinor; 6 regs
REGISTER Simd UChi_01;
REGISTER Simd UChi_02;
REGISTER Simd UChi_10;
REGISTER Simd UChi_11;
REGISTER Simd UChi_12; // 8 left
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
StencilEntry *SE;
int offset, ptype;
int num = 0;
// Xp
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xp);
XP_RECON_ACCUM;
num++;
}
// Yp
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Yp);
YP_RECON_ACCUM;
num++;
}
// Zp
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zp);
ZP_RECON_ACCUM;
num++;
}
// Tp
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tp);
TP_RECON_ACCUM;
num++;
}
// Xm
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xm);
XM_RECON_ACCUM;
num++;
}
// Ym
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Ym);
YM_RECON_ACCUM;
num++;
}
// Zm
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zm);
ZM_RECON_ACCUM;
num++;
}
// Tm
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tm);
TM_RECON_ACCUM;
num++;
}
SiteSpinor & ref (out._odata[ss]);
if ( Local ) {
vstream(ref()(0)(0),result_00);
vstream(ref()(0)(1),result_01);
vstream(ref()(0)(2),result_02);
vstream(ref()(1)(0),result_10);
vstream(ref()(1)(1),result_11);
vstream(ref()(1)(2),result_12);
vstream(ref()(2)(0),result_20);
vstream(ref()(2)(1),result_21);
vstream(ref()(2)(2),result_22);
vstream(ref()(3)(0),result_30);
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
return 1;
} else if ( num ) {
vstream(ref()(0)(0),ref()(0)(0)+result_00);
vstream(ref()(0)(1),ref()(0)(1)+result_01);
vstream(ref()(0)(2),ref()(0)(2)+result_02);
vstream(ref()(1)(0),ref()(1)(0)+result_10);
vstream(ref()(1)(1),ref()(1)(1)+result_11);
vstream(ref()(1)(2),ref()(1)(2)+result_12);
vstream(ref()(2)(0),ref()(2)(0)+result_20);
vstream(ref()(2)(1),ref()(2)(1)+result_21);
vstream(ref()(2)(2),ref()(2)(2)+result_22);
vstream(ref()(3)(0),ref()(3)(0)+result_30);
vstream(ref()(3)(1),ref()(3)(1)+result_31);
vstream(ref()(3)(2),ref()(3)(2)+result_32);
return 1;
}
return 0;
}
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
REGISTER Simd Chi_00; // two spinor; 6 regs
REGISTER Simd Chi_01;
REGISTER Simd Chi_02;
REGISTER Simd Chi_10;
REGISTER Simd Chi_11;
REGISTER Simd Chi_12; // 14 left
REGISTER Simd UChi_00; // two spinor; 6 regs
REGISTER Simd UChi_01;
REGISTER Simd UChi_02;
REGISTER Simd UChi_10;
REGISTER Simd UChi_11;
REGISTER Simd UChi_12; // 8 left
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
StencilEntry *SE;
int offset, ptype;
int num = 0;
// Xp
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xp);
XM_RECON_ACCUM;
num++;
}
// Yp
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Yp);
YM_RECON_ACCUM;
num++;
}
// Zp
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zp);
ZM_RECON_ACCUM;
num++;
}
// Tp
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TM_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tp);
TM_RECON_ACCUM;
num++;
}
// Xm
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
XP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Xm);
XP_RECON_ACCUM;
num++;
}
// Ym
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
YP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Ym);
YP_RECON_ACCUM;
num++;
}
// Zm
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Zm);
ZP_RECON_ACCUM;
num++;
}
// Tm
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
if (Local && SE->_is_local ) {
LOAD_CHIMU;
TP_PROJ;
if ( SE->_permute ) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
}
if ( Nonlocal && (!SE->_is_local) ) {
LOAD_CHI;
}
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
MULT_2SPIN(Tm);
TP_RECON_ACCUM;
num++;
}
SiteSpinor & ref (out._odata[ss]);
if ( Local ) {
vstream(ref()(0)(0),result_00);
vstream(ref()(0)(1),result_01);
vstream(ref()(0)(2),result_02);
vstream(ref()(1)(0),result_10);
vstream(ref()(1)(1),result_11);
vstream(ref()(1)(2),result_12);
vstream(ref()(2)(0),result_20);
vstream(ref()(2)(1),result_21);
vstream(ref()(2)(2),result_22);
vstream(ref()(3)(0),result_30);
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
return 1;
} else if ( num ) {
vstream(ref()(0)(0),ref()(0)(0)+result_00);
vstream(ref()(0)(1),ref()(0)(1)+result_01);
vstream(ref()(0)(2),ref()(0)(2)+result_02);
vstream(ref()(1)(0),ref()(1)(0)+result_10);
vstream(ref()(1)(1),ref()(1)(1)+result_11);
vstream(ref()(1)(2),ref()(1)(2)+result_12);
vstream(ref()(2)(0),ref()(2)(0)+result_20);
vstream(ref()(2)(1),ref()(2)(1)+result_21);
vstream(ref()(2)(2),ref()(2)(2)+result_22);
vstream(ref()(3)(0),ref()(3)(0)+result_30);
vstream(ref()(3)(1),ref()(3)(1)+result_31);
vstream(ref()(3)(2),ref()(3)(2)+result_32);
return 1;
}
return 0;
}
#else
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
@ -1094,7 +557,7 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,bool l, bool nl)
int ss,int sU,const FermionField &in, FermionField &out)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
@ -1337,14 +800,13 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
}
#endif
////////////////////////////////////////////////
// Specialise Gparity to simple implementation
////////////////////////////////////////////////
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
//check consistency of return types between these functions and the ones in WilsonKernels.cc
@ -1355,7 +817,7 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
@ -1364,7 +826,7 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,D
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
@ -1373,7 +835,7 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Doub
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
@ -1383,29 +845,29 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
int ss,int sU,const FermionField &in, FermionField &out);
}}

View File

@ -69,6 +69,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define UChi_12 %zmm23
#define Uir %zmm24
//#define ONE %zmm24
#define Uri %zmm25
#define Z0 %zmm26
@ -97,16 +98,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// CONFIG IMCI/AVX512
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef IMCI
#define ASM_IMCI
#undef ASM_AVX512
#endif
#ifdef AVX512
#define ASM_AVX512
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// Opcodes common to AVX512 and IMCI
// Opcodes common
////////////////////////////////////////////////////////////////////////////////////////////////////
#define MASK_REGS \
__asm__ ("mov $0xAAAA, %%eax \n"\
"kmov %%eax, %%k6 \n"\
"knot %%k6, %%k7 \n" : : : "%eax");
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
@ -136,11 +138,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VACCTIMESI1f(A,ACC,tmp) \
VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESI1MEMf(A,ACC,O,P) "vaddps " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2MEMf(A,ACC,O,P) "vsubrps " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI1MEMf(A,ACC,O,P) "vsubrps " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2MEMf(A,ACC,O,P) "vaddps " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"
#define VACCTIMESId(A,ACC,tmp) \
VACCTIMESI0d(A,ACC,tmp) \
VACCTIMESI1d(A,ACC,tmp) \
@ -157,14 +154,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VACCTIMESMINUSI2d(A,ACC,tmp)
#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A );
#define LOAD64(A,ptr) LOAD64i(A,ptr)
#define LOAD64(A,ptr) LOAD64i(A,ptr)
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
// Field prefetch
#define VPREFETCHNTA(O,A) "vprefetchnta "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n"
#define VPREFETCH(O,A) "vprefetch0 "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n"
#define VPREFETCHG(O,A)
#define VPREFETCHW(O,A)
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
@ -210,8 +205,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
// Need VSHUFMULMEMf,d for KNC
// AVX512 friendly
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
VSHUFMEMf(O,P,tmp) \
VMULMEMf(O,P,B,Biirr) \
@ -243,99 +236,107 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VMADDd(tmp,C,Criir)
////////////////////////////////////////////////////////////////////////////////////////////////////
// Lane swizzling changed between AVX512 and IMCI and requires arch dependent complex support
// ISA changed between AVX512 and IMCI and requires arch dependent complex support
////////////////////////////////////////////////////////////////////////////////////////////////////
// AVX512 special (Knights Landing)
#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
// Swaps Re/Im ; could unify this with IMCI
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1
////////////////////////////////////////////////////////////
// Knights Landing specials
////////////////////////////////////////////////////////////
#ifdef ASM_AVX512
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
// Swaps Re/Im
#define VSHUFd(A,DEST) "vshufpd $0x5, " #A "," #A "," #DEST ";\n"
#define VSHUFf(A,DEST) "vshufps $0x55," #A "," #A "," #DEST ";\n"
// Memops are useful for optimisation
#define VSHUFMEMd(OFF,A,DEST) "vpshufpd $0x4e, " #OFF"("#A ")," #DEST ";\n"
#define VSHUFMEMf(OFF,A,DEST) "vpshufps $0xb1, " #OFF"("#A ")," #DEST ";\n"
#define MASK_REGS \
__asm__ ("mov $0xAAAA, %%eax \n"\
"kmovw %%eax, %%k6 \n"\
"mov $0x5555, %%eax \n"\
"kmovw %%eax, %%k7 \n" : : : "%eax");
// Merges accumulation for complex dot chain
// TODO: 12 operation saving:
// # could SWIZ op 18{cdab} and eliminate temporary // 12cycles
// # no use KNL though. Fingour something else there.
// # All swizzles become perms ops, but gain addsub; subadd must use this
// # uint32_t (0x7F << 23 )
// # uint64_t (0x3FF<< 52 ) ; vpbroadcast
#define ZEND1f(Criir,Ciirr, tmp) \
"vshufps $0xb1," #Ciirr "," #Criir "," #tmp ";\n"\
"vaddps " #Criir "," #tmp "," #Criir"{%k6}" ";\n"
// Merges accumulation for complex dot chain; less efficient under avx512
//ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
//ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
//ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
//ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "," #tmp "," #Criir"{%k7}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
"vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) \
"vshufpd $0x33," #Ciirr "," #Criir "," #tmp ";\n"\
"vaddpd " #Criir "," #tmp "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "," #tmp "," #Criir"{%k7}" ";\n"
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
"vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
// Further opt possible: KNC -- use swizzle operand ; no addsub.
// KNL -- addsub. Saves 6 ops, 12 cycles; KNL cost of loading "1" as only fmaddsub
// no swizzle on KNL.
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#define VPERM0f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
#define VPERM1f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
#define VPERM2f(A,B) "vshufps " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
#define VPERM3f(A,B) "vshufps " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"
#define VPERM0d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
#define VPERM1d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
#define VPERM2d(A,B) "vshufpd " #A "," #B "," "#B" ", " 0x55 ";\n"
#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)
#endif
////////////////////////////////////////////////////////////
// Knights Corner specials
////////////////////////////////////////////////////////////
#ifdef ASM_IMCI
#define VSTOREf(OFF,PTR,SRC) "vmovnrngoaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC) "vmovnrngoapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
//#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
//#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSHUFf(A,DEST) "vmovaps " #A "{cdab} , " #DEST ";\n"
#define VSHUFd(A,DEST) "vmovapd " #A "{cdab} , " #DEST ";\n"
// Memops are useful for optimisation
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n"
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n"
#define MASK_REGS \
__asm__ ("mov $0xAAAA, %%eax \n"\
"kmov %%eax, %%k6 \n"\
"knot %%k6, %%k7 \n" : : : "%eax");
#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
@ -374,12 +375,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
// Acc = Acc - i A
#define VACCTIMESMINUSI0d(A,ACC,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
//#define ZENDf(Criir,Ciirr, tmp)
//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
@ -394,18 +394,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VPERM3d(A,B) VMOVd(A,B)
#endif
// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR)
#define LOAD_CHI(PTR) LOAD_CHIi(PTR)
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
#define LOAD_CHIMUi(PTR) \
LOAD64(%r8,PTR) \
__asm__ (\
#define LOAD_CHIMUi \
LOAD_CHIMU01i \
LOAD_CHIMU23i );
@ -437,16 +434,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// const SiteHalfSpinor *ptr = &buf[offset];
#define LOAD_CHIi(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
#define LOAD_CHIi \
VLOAD(0,%r8,Chi_00) \
VLOAD(1,%r8,Chi_01) \
VLOAD(2,%r8,Chi_02) \
VLOAD(3,%r8,Chi_10) \
VLOAD(4,%r8,Chi_11) \
VLOAD(5,%r8,Chi_12) \
);
VLOAD(5,%r8,Chi_12)
#define SAVE_UCHIi(PTR) \
LOAD64(%r8,PTR) \
@ -585,7 +580,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
ZEND2(UChi_12,Z5,Chi_12) );
#define MULT_2SPIN(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
@ -667,56 +661,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
//define VTIMESIf(A,DEST, Z)
// These don't work if DEST==Z. FIXME.
#define XP_PROJ __asm__ ( \
VACCTIMESI(Chimu_30,Chi_00,Z0) \
VACCTIMESI(Chimu_31,Chi_01,Z1) \
VACCTIMESI(Chimu_32,Chi_02,Z2) \
VACCTIMESI(Chimu_20,Chi_10,Z3) \
VACCTIMESI(Chimu_21,Chi_11,Z4) \
VACCTIMESI(Chimu_22,Chi_12,Z5) );
#define XP_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESI1MEM(Chimu_30,Chi_00,0,%r8) \
VACCTIMESI1MEM(Chimu_31,Chi_01,1,%r8) \
VACCTIMESI1MEM(Chimu_32,Chi_02,2,%r8) \
VACCTIMESI1MEM(Chimu_20,Chi_10,3,%r8) \
VACCTIMESI1MEM(Chimu_21,Chi_11,4,%r8) \
VACCTIMESI1MEM(Chimu_22,Chi_12,5,%r8) \
VACCTIMESI2MEM(Chimu_30,Chi_00,0,%r8) \
VACCTIMESI2MEM(Chimu_31,Chi_01,1,%r8) \
VACCTIMESI2MEM(Chimu_32,Chi_02,2,%r8) \
VACCTIMESI2MEM(Chimu_20,Chi_10,3,%r8) \
VACCTIMESI2MEM(Chimu_21,Chi_11,4,%r8) \
VACCTIMESI2MEM(Chimu_22,Chi_12,5,%r8) );
#define YP_PROJ __asm__ ( \
VSUB(Chimu_30,Chimu_00,Chi_00)\
VSUB(Chimu_31,Chimu_01,Chi_01)\
VSUB(Chimu_32,Chimu_02,Chi_02)\
VADD(Chimu_10,Chimu_20,Chi_10)\
VADD(Chimu_11,Chimu_21,Chi_11)\
VADD(Chimu_12,Chimu_22,Chi_12) );
#define EVICT_SPINOR(reg) \
VEVICT(0,reg) \
VEVICT(1,reg) \
VEVICT(2,reg) \
VEVICT(3,reg) \
VEVICT(4,reg) \
VEVICT(5,reg) \
VEVICT(6,reg) \
VEVICT(7,reg) \
VEVICT(8,reg) \
VEVICT(9,reg) \
VEVICT(9,reg) \
VEVICT(10,reg) \
VEVICT(11,reg)
VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );
#define YP_PROJMEM(ptr) \
@ -729,43 +690,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VADDMEM(6,%r8,Chimu_10,Chi_10) \
VADDMEM(7,%r8,Chimu_11,Chi_11) \
VADDMEM(8,%r8,Chimu_12,Chi_12) );
// EVICT_SPINOR(%r8) );
#define ZP_PROJ __asm__ ( \
VACCTIMESI(Chimu_20,Chi_00,Z0) \
VACCTIMESI(Chimu_21,Chi_01,Z1) \
VACCTIMESI(Chimu_22,Chi_02,Z2) \
VACCTIMESMINUSI(Chimu_30,Chi_10,Z3) \
VACCTIMESMINUSI(Chimu_31,Chi_11,Z4) \
VACCTIMESMINUSI(Chimu_32,Chi_12,Z5) );
#define ZP_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
LOAD_CHIi \
SHUF_CHIMU23i \
VACCTIMESI1MEM(Chimu_20,Chi_00,0,%r8) \
VACCTIMESI1MEM(Chimu_21,Chi_01,1,%r8) \
VACCTIMESI1MEM(Chimu_22,Chi_02,2,%r8) \
VACCTIMESMINUSI1MEM(Chimu_30,Chi_10,3,%r8) \
VACCTIMESMINUSI1MEM(Chimu_31,Chi_11,4,%r8) \
VACCTIMESMINUSI1MEM(Chimu_32,Chi_12,5,%r8) \
VACCTIMESI2MEM(Chimu_20,Chi_00,0,%r8) \
VACCTIMESI2MEM(Chimu_21,Chi_01,1,%r8) \
VACCTIMESI2MEM(Chimu_22,Chi_02,2,%r8) \
VACCTIMESMINUSI2MEM(Chimu_30,Chi_10,3,%r8) \
VACCTIMESMINUSI2MEM(Chimu_31,Chi_11,4,%r8) \
VACCTIMESMINUSI2MEM(Chimu_32,Chi_12,5,%r8) \
EVICT_SPINOR(%r8) );
#define TP_PROJ __asm__ ( \
VADD(Chimu_00,Chimu_20,Chi_00) \
VADD(Chimu_01,Chimu_21,Chi_01) \
VADD(Chimu_02,Chimu_22,Chi_02) \
VADD(Chimu_10,Chimu_30,Chi_10) \
VADD(Chimu_11,Chimu_31,Chi_11) \
VADD(Chimu_12,Chimu_32,Chi_12) );
VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
#define TP_PROJMEM(ptr) \
@ -777,44 +719,28 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VADDMEM(8,%r8,Chimu_02,Chi_02) \
VADDMEM(9,%r8,Chimu_10,Chi_10) \
VADDMEM(10,%r8,Chimu_11,Chi_11) \
VADDMEM(11,%r8,Chimu_12,Chi_12) \
EVICT_SPINOR(%r8) );
VADDMEM(11,%r8,Chimu_12,Chi_12) );
// hspin(0)=fspin(0)-timesI(fspin(3))
// hspin(1)=fspin(1)-timesI(fspin(2))
#define XM_PROJ __asm__ ( \
VACCTIMESMINUSI(Chimu_30,Chi_00,Z0) \
VACCTIMESMINUSI(Chimu_31,Chi_01,Z1) \
VACCTIMESMINUSI(Chimu_32,Chi_02,Z2) \
VACCTIMESMINUSI(Chimu_20,Chi_10,Z3) \
VACCTIMESMINUSI(Chimu_21,Chi_11,Z4) \
VACCTIMESMINUSI(Chimu_22,Chi_12,Z5) );
#define XM_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
LOAD64(%r8,PTR)\
__asm__ ( \
SHUF_CHIMU23i \
VACCTIMESMINUSI1MEM(Chimu_30,Chi_00,0,%r8) \
VACCTIMESMINUSI1MEM(Chimu_31,Chi_01,1,%r8) \
VACCTIMESMINUSI1MEM(Chimu_32,Chi_02,2,%r8) \
VACCTIMESMINUSI1MEM(Chimu_20,Chi_10,3,%r8) \
VACCTIMESMINUSI1MEM(Chimu_21,Chi_11,4,%r8) \
VACCTIMESMINUSI1MEM(Chimu_22,Chi_12,5,%r8) \
VACCTIMESMINUSI2MEM(Chimu_30,Chi_00,0,%r8) \
VACCTIMESMINUSI2MEM(Chimu_31,Chi_01,1,%r8) \
VACCTIMESMINUSI2MEM(Chimu_32,Chi_02,2,%r8) \
VACCTIMESMINUSI2MEM(Chimu_20,Chi_10,3,%r8) \
VACCTIMESMINUSI2MEM(Chimu_21,Chi_11,4,%r8) \
VACCTIMESMINUSI2MEM(Chimu_22,Chi_12,5,%r8) );
#define YM_PROJ __asm__ ( \
VADD(Chimu_00,Chimu_30,Chi_00)\
VADD(Chimu_01,Chimu_31,Chi_01)\
VADD(Chimu_02,Chimu_32,Chi_02)\
VSUB(Chimu_20,Chimu_10,Chi_10)\
VSUB(Chimu_21,Chimu_11,Chi_11)\
VSUB(Chimu_22,Chimu_12,Chi_12) );
LOAD_CHIi \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
#define YM_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
@ -825,45 +751,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VADDMEM(11,%r8,Chimu_02,Chi_02) \
VSUBMEM(6,%r8,Chimu_10,Chi_10) \
VSUBMEM(7,%r8,Chimu_11,Chi_11) \
VSUBMEM(8,%r8,Chimu_12,Chi_12) \
EVICT_SPINOR(%r8) );
#define ZM_PROJ __asm__ ( \
VACCTIMESMINUSI(Chimu_20,Chi_00,Z0)\
VACCTIMESMINUSI(Chimu_21,Chi_01,Z1)\
VACCTIMESMINUSI(Chimu_22,Chi_02,Z2)\
VACCTIMESI(Chimu_30,Chi_10,Z3)\
VACCTIMESI(Chimu_31,Chi_11,Z4)\
VACCTIMESI(Chimu_32,Chi_12,Z5));
VSUBMEM(8,%r8,Chimu_12,Chi_12) );
#define ZM_PROJMEM(PTR) \
LOAD64(%r8,PTR) \
__asm__ ( \
SHUF_CHIMU23i \
VACCTIMESMINUSI1MEM(Chimu_20,Chi_00,0,%r8) \
VACCTIMESMINUSI1MEM(Chimu_21,Chi_01,1,%r8) \
VACCTIMESMINUSI1MEM(Chimu_22,Chi_02,2,%r8) \
VACCTIMESI1MEM(Chimu_30,Chi_10,3,%r8) \
VACCTIMESI1MEM(Chimu_31,Chi_11,4,%r8) \
VACCTIMESI1MEM(Chimu_32,Chi_12,5,%r8) \
VACCTIMESMINUSI2MEM(Chimu_20,Chi_00,0,%r8) \
VACCTIMESMINUSI2MEM(Chimu_21,Chi_01,1,%r8) \
VACCTIMESMINUSI2MEM(Chimu_22,Chi_02,2,%r8) \
VACCTIMESI2MEM(Chimu_30,Chi_10,3,%r8) \
VACCTIMESI2MEM(Chimu_31,Chi_11,4,%r8) \
VACCTIMESI2MEM(Chimu_32,Chi_12,5,%r8) \
EVICT_SPINOR(%r8) );
#define TM_PROJ __asm__ ( \
VSUB(Chimu_20,Chimu_00,Chi_00)\
VSUB(Chimu_21,Chimu_01,Chi_01)\
VSUB(Chimu_22,Chimu_02,Chi_02)\
VSUB(Chimu_30,Chimu_10,Chi_10)\
VSUB(Chimu_31,Chimu_11,Chi_11)\
VSUB(Chimu_32,Chimu_12,Chi_12) );
LOAD_CHIi \
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
#define TM_PROJMEM(ptr) \
LOAD64(%r8,ptr) \
@ -874,8 +780,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
VSUBMEM(10,%r8,Chimu_11,Chi_11) \
VSUBMEM(11,%r8,Chimu_12,Chi_12) \
EVICT_SPINOR(%r8) );
VSUBMEM(11,%r8,Chimu_12,Chi_12) );
// fspin(0)=hspin(0)
// fspin(1)=hspin(1)

View File

@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <immintrin.h>
namespace Grid{
namespace Optimization {
struct Vsplat{
@ -246,26 +246,30 @@ namespace Optimization {
struct TimesMinusI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
//__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
//return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
return _mm512_shuffle_pd(tmp,tmp,0x55);
//__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
//return _mm512_shuffle_pd(tmp,tmp,0x55);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
}
};
struct TimesI{
//Complex single
inline __m512 operator()(__m512 in, __m512 ret){
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
}
//Complex double
inline __m512d operator()(__m512d in, __m512d ret){
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
}
@ -345,7 +349,7 @@ namespace Optimization {
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
typedef __m512 SIMD_Ftype; // Single precision type
typedef __m512d SIMD_Dtype; // Double precision type
typedef __m512i SIMD_Itype; // Integer type

View File

@ -145,7 +145,7 @@ void Tester(const functor &func)
int ok=0;
for(int i=0;i<Nsimd;i++){
if ( abs(reference[i]-result[i])>0){
if ( abs(reference[i]-result[i])>1.0e-7){
std::cout<<GridLogMessage<< "*****" << std::endl;
std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
ok++;

View File

@ -32,10 +32,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
using namespace Grid;
using namespace Grid::QCD;
void ZmulF(void *ptr1,void *ptr2,void *ptr3);
void Zmul(void *ptr1,void *ptr2,void *ptr3);
void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3);
void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);
void TimesIAvx512F(void *ptr1,void *ptr3);
void TimesIAvx512(void *ptr1,void *ptr3);
void TimesMinusIAvx512F(void *ptr1,void *ptr3);
void TimesMinusIAvx512(void *ptr1,void *ptr3);
@ -63,50 +68,106 @@ int main(int argc,char **argv)
vColourMatrixD mat;
vHalfSpinColourVectorD vec;
vHalfSpinColourVectorD vec1;
vHalfSpinColourVectorD vec2;
vHalfSpinColourVectorD vec3;
vHalfSpinColourVectorD matvec;
vHalfSpinColourVectorD ref;
vComplexD err;
random(sRNG,vec1);
vec1 = std::complex<double>(0.1,3.0);
random(sRNG,vec2);
vec2=2.0;
random(sRNG,vec3);
//std::cout << "Zmul vec1"<<vec1<<" &vec1 "<<& vec1<<std::endl;
//std::cout << "Zmul vec2"<<vec2<<" &vec2 "<<& vec2<<std::endl;
//std::cout << "Zmul vec3"<<vec3<<" &vec3 "<<& vec3<<std::endl;
for(int sp=0;sp<2;sp++){
for(int co=0;co<3;co++){
ref()(sp)(co) = vec1()(sp)(co)*vec2()(sp)(co);
}}
Zmul((void *)&vec1,(void *)&vec2,(void *)&vec3);
//std::cout << "Zmul vec3"<<vec3<<" &vec3 "<<& vec3<<std::endl;
//std::cout << "Zmul \n\t ref "<<ref<<"\n\t vec3"<<vec3 <<std::endl;
ref = ref - vec3;
err = TensorRemove(innerProduct(ref,ref));
std::cout <<"Zmul diff "<< Reduce(err)<<std::endl;
random(sRNG,mat);
mat = zero;
mat()()(0,0) = 1.0;
random(sRNG,vec);
ref = mat*vec;
WilsonDslashAvx512((void *)&vec, (void *)&mat,(void *)&matvec);
//std::cout << ref <<std::endl;
//std::cout << matvec<<std::endl;
ref = ref - matvec;
err = TensorRemove(innerProduct(ref,ref));
std::cout <<"Double SU3 x 2spin diff "<< Reduce(err)<<std::endl;
vColourMatrixF matF;
vHalfSpinColourVectorF vec1F;
vHalfSpinColourVectorF vec2F;
vHalfSpinColourVectorF vec3F;
vHalfSpinColourVectorF vecF;
vHalfSpinColourVectorF matvecF;
vHalfSpinColourVectorF refF;
vComplexF errF;
random(sRNG,matF);
matF = zero;
matF()()(0,0)=1.0;
random(sRNG,vecF);
refF = matF*vecF;
WilsonDslashAvx512F((void *)&vecF, (void *)&matF,(void *)&matvecF);
//std::cout << refF <<std::endl;
//std::cout << matvecF<<std::endl;
refF = refF-matvecF;
errF = TensorRemove(innerProduct(refF,refF));
std::cout <<"Single SU3 x 2spin diff "<< Reduce(errF)<<std::endl;
TimesIAvx512F((void *)&vecF,(void *)&matvecF);
//std::cout << timesI(vecF)<<std::endl;
//std::cout << matvecF<<std::endl;
refF = timesI(vecF)-matvecF;
errF = TensorRemove(innerProduct(refF,refF));
std::cout <<" timesI single diff "<< Reduce(errF)<<std::endl;
TimesIAvx512((void *)&vec,(void *)&matvec);
//std::cout << timesI(vec)<<std::endl;
//std::cout << matvec<<std::endl;
ref = timesI(vec)-matvec;
err = TensorRemove(innerProduct(ref,ref));
std::cout <<" timesI double diff "<< Reduce(err)<<std::endl;
TimesMinusIAvx512F((void *)&vecF,(void *)&matvecF);
//std::cout << timesMinusI(vecF)<<std::endl;
//std::cout << matvecF<<std::endl;
refF = timesMinusI(vecF)-matvecF;
errF = TensorRemove(innerProduct(refF,refF));
std::cout <<" timesMinusI single diff "<< Reduce(errF)<<std::endl;
TimesMinusIAvx512((void *)&vec,(void *)&matvec);
//std::cout << timesMinusI(vec)<<std::endl;
//std::cout << matvec<<std::endl;
ref = timesMinusI(vec)-matvec;
err = TensorRemove(innerProduct(ref,ref));
std::cout <<" timesMinusI double diff "<< Reduce(err)<<std::endl;
LatticeFermion src (FGrid);
LatticeFermion tmp (FGrid);
LatticeFermion srce(FrbGrid);
LatticeFermion resulto(FrbGrid); resulto=zero;
@ -114,13 +175,14 @@ int main(int argc,char **argv)
LatticeFermion diff(FrbGrid);
LatticeGaugeField Umu(UGrid);
#if 1
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
random(RNG5,src);
#if 1
random(RNG4,Umu);
#else
int mmu=3;
int mmu=2;
std::vector<LatticeColourMatrix> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@ -157,7 +219,7 @@ int main(int argc,char **argv)
}
t1=usecond();
#if 1
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
Dw.DhopOE(srce,resulta,0);
PerformanceCounter Counter(i);
@ -166,14 +228,28 @@ int main(int argc,char **argv)
Counter.Stop();
Counter.Report();
}
resulta = (-0.5) * resulta;
#endif
//resulta = (-0.5) * resulta;
std::cout<<GridLogMessage << "Called Asm Dw"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(resulta)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops*ncall/(t1-t0)<<std::endl;
diff = resulto-resulta;
std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
std::cout<<std::endl;
#if 0
std::cout<<"=========== result Grid ============="<<std::endl;
std::cout<<std::endl;
tmp = zero;
setCheckerboard(tmp,resulto);
std::cout<<tmp<<std::endl;
std::cout<<std::endl;
std::cout<<"=========== result ASM ============="<<std::endl;
std::cout<<std::endl;
tmp = zero;
setCheckerboard(tmp,resulta);
std::cout<<tmp<<std::endl;
#endif
}
#undef VLOAD
@ -204,12 +280,91 @@ int main(int argc,char **argv)
#define zz Z0
void Zmul(void *ptr1,void *ptr2,void *ptr3)
{
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
__asm__ ("kmovw %%eax, %%k6 " : : :);
__asm__ ("mov $0x5555, %%eax " : : :"%eax");
__asm__ ("kmovw %%eax, %%k7 " : : :);
#define CC result_00
LOAD64(%r9,ptr1);
LOAD64(%r8,ptr2);
LOAD64(%r10,ptr3)
__asm__ (
VLOAD(0,%r8,CC)
ZLOAD(0,%r9,Chi_00,Z0)
ZMUL(Chi_00,Z0,CC,UChi_00,Z1)
//VSTORE(0,%r10,UChi_00)
//VSTORE(1,%r10,Z1)
ZEND1(UChi_00,Z1,Z0)
//VSTORE(2,%r10,UChi_00)
ZEND2(UChi_00,Z1,Z0)
//VSTORE(3,%r10,UChi_00)
VSTORE(0,%r10,UChi_00)
VLOAD(1,%r8,CC)
ZLOAD(1,%r9,Chi_01,Z0)
ZMUL(Chi_01,Z0,CC,UChi_01,Z1)
ZEND1(UChi_01,Z1,Z0)
ZEND2(UChi_01,Z1,Z0)
VSTORE(1,%r10,UChi_01)
VLOAD(2,%r8,CC)
ZLOAD(2,%r9,Chi_02,Z0)
ZMUL(Chi_02,Z0,CC,UChi_02,Z1)
ZEND1(UChi_02,Z1,Z0)
ZEND2(UChi_02,Z1,Z0)
VSTORE(2,%r10,UChi_02)
VLOAD(3,%r8,CC)
ZLOAD(3,%r9,Chi_10,Z0)
ZMUL(Chi_10,Z0,CC,UChi_10,Z1)
ZEND1(UChi_10,Z1,Z0)
ZEND2(UChi_10,Z1,Z0)
VSTORE(3,%r10,UChi_10)
VLOAD(4,%r8,CC)
ZLOAD(4,%r9,Chi_11,Z0)
ZMUL(Chi_11,Z0,CC,UChi_11,Z1)
ZEND1(UChi_11,Z1,Z0)
ZEND2(UChi_11,Z1,Z0)
VSTORE(4,%r10,UChi_11)
VLOAD(5,%r8,CC)
ZLOAD(5,%r9,Chi_12,Z0)
ZMUL(Chi_12,Z0,CC,UChi_12,Z1)
ZEND1(UChi_12,Z1,Z0)
ZEND2(UChi_12,Z1,Z0)
VSTORE(5,%r10,UChi_12)
);
}
void TimesMinusIAvx512(void *ptr1,void *ptr3)
{
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
__asm__ ("kmovw %%eax, %%k6 " : : :);
__asm__ ("mov $0x5555, %%eax " : : :"%eax");
__asm__ ("kmovw %%eax, %%k7 " : : :);
MASK_REGS;
LOAD_CHI(ptr1);
__asm__ (
VZERO(zz)
VTIMESMINUSI(Chi_00,UChi_00,zz)
VTIMESMINUSI(Chi_01,UChi_01,zz)
VTIMESMINUSI(Chi_02,UChi_02,zz)
VTIMESMINUSI(Chi_10,UChi_10,zz)
VTIMESMINUSI(Chi_11,UChi_11,zz)
VTIMESMINUSI(Chi_12,UChi_12,zz)
);
SAVE_UCHI(ptr3);
}
void TimesIAvx512(void *ptr1,void *ptr3)
{
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
__asm__ ("kmov %%eax, %%k6 " : : :);
__asm__ ("knot %%k6, %%k7 " : : :);
__asm__ ("kmovw %%eax, %%k6 " : : :);
__asm__ ("mov $0x5555, %%eax " : : :"%eax");
__asm__ ("kmovw %%eax, %%k7 " : : :);
MASK_REGS;
@ -288,6 +443,68 @@ void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
void ZmulF(void *ptr1,void *ptr2,void *ptr3)
{
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
__asm__ ("kmovw %%eax, %%k6 " : : :);
__asm__ ("mov $0x5555, %%eax " : : :"%eax");
__asm__ ("kmovw %%eax, %%k7 " : : :);
MASK_REGS;
ZLOAD(0,ptr1,Chi_00,Z0);
ZLOAD(1,ptr1,Chi_01,Z1);
ZLOAD(2,ptr1,Chi_02,Z2);
ZLOAD(3,ptr1,Chi_10,Z3);
ZLOAD(4,ptr1,Chi_11,Z4);
ZLOAD(5,ptr1,Chi_12,Z5);
VLOAD(0,ptr2,Chi_20);
VLOAD(1,ptr2,Chi_21);
VLOAD(2,ptr2,Chi_22);
VLOAD(3,ptr2,Chi_30);
VLOAD(4,ptr2,Chi_31);
VLOAD(5,ptr2,Chi_32);
ZMUL(Chi_00,Z0,Chi_20,UChi_00,UChi_20);
ZMUL(Chi_01,Z1,Chi_21,UChi_01,UChi_21);
ZMUL(Chi_02,Z2,Chi_22,UChi_02,UChi_22);
ZMUL(Chi_10,Z3,Chi_23,UChi_10,UChi_30);
ZMUL(Chi_11,Z4,Chi_24,UChi_11,UChi_31);
ZMUL(Chi_12,Z5,Chi_25,UChi_12,UChi_32);
ZEND1(UChi_00,UChi_20,Z0);
ZEND1(UChi_01,UChi_21,Z1);
ZEND1(UChi_02,UChi_22,Z2);
ZEND1(UChi_10,UChi_30,Z3);
ZEND1(UChi_11,UChi_31,Z4);
ZEND1(UChi_12,UChi_32,Z5);
ZEND2(UChi_00,UChi_20,Z0);
ZEND2(UChi_01,UChi_21,Z1);
ZEND2(UChi_02,UChi_22,Z2);
ZEND2(UChi_10,UChi_30,Z3);
ZEND2(UChi_11,UChi_31,Z4);
ZEND2(UChi_12,UChi_32,Z5);
SAVE_UCHI(ptr3);
}
void TimesMinusIAvx512F(void *ptr1,void *ptr3)
{
MASK_REGS;
LOAD_CHI(ptr1);
__asm__ (
VZERO(zz)
VTIMESMINUSI(Chi_00,UChi_00,zz)
VTIMESMINUSI(Chi_01,UChi_01,zz)
VTIMESMINUSI(Chi_02,UChi_02,zz)
VTIMESMINUSI(Chi_10,UChi_10,zz)
VTIMESMINUSI(Chi_11,UChi_11,zz)
VTIMESMINUSI(Chi_12,UChi_12,zz)
);
SAVE_UCHI(ptr3);
}
void TimesIAvx512F(void *ptr1,void *ptr3)
{
MASK_REGS;