1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00
This commit is contained in:
paboyle 2016-06-25 11:08:05 -07:00
parent 51cb2d4328
commit 2d8bb4c594
5 changed files with 242 additions and 51 deletions

View File

@ -68,10 +68,12 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
int Lmax=32;
int dmin=0;
if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
for (int L=8;L<Lmax;L*=2){
if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
for (int L=8;L<=Lmax;L*=2){
std::vector<int> latt4(4,L);
for(int d=4;d>0;d--){
for(int d=4;d>dmin;d--){
if ( d<=3 ) latt4[d]*=2;
std::cout << GridLogMessage <<"\t";
for(int d=0;d<Nd;d++){
@ -170,7 +172,11 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
Dw.Dhop(src,result,0);
double t1=usecond();
#ifdef TIMERS_OFF
int ncall =10;
#else
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif
if (ncall < 5 ) exit(0);
@ -297,7 +303,11 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
sDw.Dhop(ssrc,sresult,0);
double t1=usecond();
#ifdef TIMERS_OFF
int ncall =10;
#else
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif
PerformanceCounter Counter(8);
Counter.Start();
@ -340,7 +350,9 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
CounterSdw.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
__SSC_STOP;
}
t1=usecond();
CounterSdw.Stop();

View File

@ -67,9 +67,12 @@ int setupSigns(void ){
}
static int signInit = setupSigns();
#define label(A) ilabel(A)
#define ilabel(A) ".globl\n" #A ":\n"
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define FX(A) WILSONASM_ ##A
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
@ -80,6 +83,8 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#undef FX
#define FX(A) DWFASM_ ## A
#define MAYBEPERM(A,B)
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)

View File

@ -1,8 +1,7 @@
{
int locala,perma, ptypea;
int localb,permb, ptypeb;
int localc,permc, ptypec;
uint64_t basea, baseb, basec;
uint64_t basea, baseb;
uint64_t basex;
const uint64_t plocal =(uint64_t) & in._odata[0];
@ -12,22 +11,15 @@
MASK_REGS;
for(int site=0;site<Ns;site++) {
int sU=lo.Reorder(ssU);
int sU=lo.Reorder(ssU);
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
ss=sU*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
basex = basea;
if ( locala ) {
@ -47,7 +39,6 @@
// Yp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
@ -56,7 +47,7 @@
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basec);
MULT_2SPIN_DIR_PFYP(Yp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
@ -65,16 +56,15 @@
// Zp
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
if ( localc ) {
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basec);
MAYBEPERM(PERMUTE_DIR1,permc);
ZM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basec);
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZP(Zp,basea);
MULT_2SPIN_DIR_PFZP(Zp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
@ -82,17 +72,16 @@
////////////////////////////////
// Tp
////////////////////////////////
basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
if ( locala ) {
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR0,perma);
TM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(basea);
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTP(Tp,baseb);
MULT_2SPIN_DIR_PFTP(Tp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
@ -100,17 +89,16 @@
////////////////////////////////
// Xm
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
if ( localb ) {
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR3,permb);
XP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(baseb);
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXM(Xm,basec);
MULT_2SPIN_DIR_PFXM(Xm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
@ -118,14 +106,13 @@
////////////////////////////////
// Ym
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
if ( localc ) {
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(basec);
MAYBEPERM(PERMUTE_DIR2,permc);
YP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(basec);
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
@ -136,8 +123,7 @@
////////////////////////////////
// Zm
////////////////////////////////
basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
@ -155,7 +141,6 @@
// Tm
////////////////////////////////
basea = (uint64_t)&out._odata[ss];
PREFETCH_CHIMU(basea);
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
@ -163,16 +148,16 @@
} else {
LOAD_CHI(baseb);
}
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal);
{
MULT_2SPIN_DIR_PFTM(Tm,basec);
MULT_2SPIN_DIR_PFTM(Tm,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
// PREFETCH_CHIMU(basex);
SAVE_RESULT(&out._odata[ss]);
}
SAVE_RESULT(&out._odata[ss],baseb);
}
ssU++;
}
}

View File

@ -0,0 +1,187 @@
{
// Kernel body fragment: meant to be textually included inside a function that
// already provides `st`, `lo`, `in`, `out`, `signs`, `ss`, `ssU`, `Ns`, `Ls`
// and the assembly macros used below (none of them are declared here).
//
// For each of the Ns outer iterations the index ssU is mapped through
// lo.Reorder(); for each s in [0,Ls) the working index is ss = sU*Ls+s.
// For every ss, eight stages run in the fixed order Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm and
// the result is written with SAVE_RESULT(&out._odata[ss]).
//
// Scheduling pattern (from the assignments below): three (base,local,perm,ptype)
// slots a/b/c are used in rotation. Each stage consumes the slot fetched two
// stages earlier, hands the next slot to its MULT_2SPIN_DIR_PF* macro, and
// refills the slot it has just finished with via st.GetInfo(...,ent,...).
// NOTE(review): the "PF" in the macro names suggests the second argument is a
// prefetch address — confirm against the macro definitions.
//
// ptypea/ptypeb/ptypec are filled in by st.GetInfo but never read in this block.
int locala,perma, ptypea;
int localb,permb, ptypeb;
int localc,permc, ptypec;
uint64_t basea, baseb, basec;
// basex is assigned once per ss but its only use (PREFETCH_CHIMU(basex) near
// the end) is commented out.
uint64_t basex;
// Address of the first element of the input field, passed to every GetInfo call.
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
// Pointer to the sign table used by every LOAD64(%r10,isigns) below.
vComplexF *isigns = &signs[0];
MASK_REGS;
for(int site=0;site<Ns;site++) {
int sU=lo.Reorder(ssU);
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
// ent indexes the stencil entries for this ss; eight entries per ss.
// The first three entries (ent+0, ent+1, ent+2) are fetched up front into
// slots a, b and c respectively.
// NOTE(review): the direction argument passed to GetInfo (Xp/Yp/Zp here, then
// alternating Xp/Yp in later stages) does not follow the entry being fetched;
// presumably only `ent` selects the entry — confirm against GetInfo.
int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
basex = basea;
// label()/FX() come from the including file; presumably they tag this stage
// with a kernel-specific symbol name — confirm against the includer.
label(FX(XP) );
// Consumes slot a (entry 0). When locala is set the data is projected from
// memory and optionally permuted; otherwise it is loaded as-is with LOAD_CHI.
if ( locala ) {
LOAD64(%r10,isigns);
XM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXP(Xp,baseb);
}
LOAD64(%r10,isigns);
// First stage uses XM_RECON; every later stage uses a *_RECON_ACCUM macro.
XM_RECON;
////////////////////////////////
// Yp
////////////////////////////////
// Refill slot a with entry 3 (consumed later by the Tp stage).
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
label(FX(YP) );
// Consumes slot b (entry 1).
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basec);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
////////////////////////////////
// Zp
////////////////////////////////
// Refill slot b with entry 4 (consumed later by the Xm stage).
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
label(FX(ZP) );
// Consumes slot c (entry 2).
if ( localc ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basec);
MAYBEPERM(PERMUTE_DIR1,permc);
} else {
LOAD_CHI(basec);
}
{
MULT_2SPIN_DIR_PFZP(Zp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
////////////////////////////////
// Tp
////////////////////////////////
// Refill slot c with entry 5 (consumed later by the Ym stage).
basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
label(FX(TP) );
// Consumes slot a (entry 3).
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR0,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFTP(Tp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
////////////////////////////////
// Xm
////////////////////////////////
// Refill slot a with entry 6 (consumed later by the Zm stage).
basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
label(FX(XM) );
// Consumes slot b (entry 4).
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR3,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFXM(Xm,basec);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
////////////////////////////////
// Ym
////////////////////////////////
// Refill slot b with entry 7 (consumed later by the Tm stage).
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
label(FX(YM) );
// Consumes slot c (entry 5).
if ( localc ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(basec);
MAYBEPERM(PERMUTE_DIR2,permc);
} else {
LOAD_CHI(basec);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_RECON_ACCUM;
////////////////////////////////
// Zm
////////////////////////////////
// At this point ent == ss*8+8, i.e. one past the eight entries of this ss
// (the first entry of index ss+1). The fetched localc/permc are not read again
// before the next iteration overwrites them; only basec is used, as the second
// argument of the Tm multiply below.
// NOTE(review): for the last ss this index may lie beyond the stencil table —
// confirm that st.GetInfo tolerates it.
basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
label(FX(ZM) );
// Consumes slot a (entry 6).
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZM(Zm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_RECON_ACCUM;
////////////////////////////////
// Tm
////////////////////////////////
// No stencil entry is left to fetch for this ss; slot a is pointed at the
// output element for ss instead and passed to PREFETCH_CHIMU.
basea = (uint64_t)&out._odata[ss];
PREFETCH_CHIMU(basea);
label(FX(TM) );
// Consumes slot b (entry 7).
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTM(Tm,basec);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
// PREFETCH_CHIMU(basex);
label(FX(SAV) );
// Store the accumulated result for this ss.
SAVE_RESULT(&out._odata[ss]);
}
// Advance the un-reordered outer index once per outer iteration.
ssU++;
}
}

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -37,6 +37,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
"mov $0x5555, %%eax \n"\
"kmovw %%eax, %%k7 \n" : : : "%eax");
//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" );
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"