1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 19:25:56 +01:00
This commit is contained in:
paboyle 2016-06-25 11:08:05 -07:00
parent 51cb2d4328
commit 2d8bb4c594
5 changed files with 242 additions and 51 deletions

View File

@ -68,10 +68,12 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "=========================================================================="<<std::endl; std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
int Lmax=32; int Lmax=32;
int dmin=0;
if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX")); if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
for (int L=8;L<Lmax;L*=2){ if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
for (int L=8;L<=Lmax;L*=2){
std::vector<int> latt4(4,L); std::vector<int> latt4(4,L);
for(int d=4;d>0;d--){ for(int d=4;d>dmin;d--){
if ( d<=3 ) latt4[d]*=2; if ( d<=3 ) latt4[d]*=2;
std::cout << GridLogMessage <<"\t"; std::cout << GridLogMessage <<"\t";
for(int d=0;d<Nd;d++){ for(int d=0;d<Nd;d++){
@ -170,7 +172,11 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
Dw.Dhop(src,result,0); Dw.Dhop(src,result,0);
double t1=usecond(); double t1=usecond();
#ifdef TIMERS_OFF
int ncall =10;
#else
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif
if (ncall < 5 ) exit(0); if (ncall < 5 ) exit(0);
@ -297,7 +303,11 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
sDw.Dhop(ssrc,sresult,0); sDw.Dhop(ssrc,sresult,0);
double t1=usecond(); double t1=usecond();
#ifdef TIMERS_OFF
int ncall =10;
#else
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0)); int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif
PerformanceCounter Counter(8); PerformanceCounter Counter(8);
Counter.Start(); Counter.Start();
@ -340,7 +350,9 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
CounterSdw.Start(); CounterSdw.Start();
t0=usecond(); t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
__SSC_START;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo); sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
__SSC_STOP;
} }
t1=usecond(); t1=usecond();
CounterSdw.Stop(); CounterSdw.Stop();

View File

@ -67,9 +67,12 @@ int setupSigns(void ){
} }
static int signInit = setupSigns(); static int signInit = setupSigns();
#define label(A) ilabel(A)
#define ilabel(A) ".globl\n" #A ":\n"
#define MAYBEPERM(A,perm) if (perm) { A ; } #define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define FX(A) WILSONASM_ ##A
template<> template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
@ -80,6 +83,8 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd
#undef VMOVRDUP #undef VMOVRDUP
#undef MAYBEPERM #undef MAYBEPERM
#undef MULT_2SPIN #undef MULT_2SPIN
#undef FX
#define FX(A) DWFASM_ ## A
#define MAYBEPERM(A,B) #define MAYBEPERM(A,B)
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) #define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) #define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)

View File

@ -1,8 +1,7 @@
{ {
int locala,perma, ptypea; int locala,perma, ptypea;
int localb,permb, ptypeb; int localb,permb, ptypeb;
int localc,permc, ptypec; uint64_t basea, baseb;
uint64_t basea, baseb, basec;
uint64_t basex; uint64_t basex;
const uint64_t plocal =(uint64_t) & in._odata[0]; const uint64_t plocal =(uint64_t) & in._odata[0];
@ -12,22 +11,15 @@
MASK_REGS; MASK_REGS;
for(int site=0;site<Ns;site++) { for(int site=0;site<Ns;site++) {
int sU=lo.Reorder(ssU); int sU=lo.Reorder(ssU);
for(int s=0;s<Ls;s++) { for(int s=0;s<Ls;s++) {
ss =sU*Ls+s; ss=sU*Ls+s;
//////////////////////////////// ////////////////////////////////
// Xp // Xp
//////////////////////////////// ////////////////////////////////
int ent=ss*8;// 2*Ndim int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
basex = basea; basex = basea;
if ( locala ) { if ( locala ) {
@ -47,7 +39,6 @@
// Yp // Yp
//////////////////////////////// ////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++; basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
if ( localb ) { if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb); YM_PROJMEM(baseb);
@ -56,7 +47,7 @@
LOAD_CHI(baseb); LOAD_CHI(baseb);
} }
{ {
MULT_2SPIN_DIR_PFYP(Yp,basec); MULT_2SPIN_DIR_PFYP(Yp,basea);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM; YM_RECON_ACCUM;
@ -65,16 +56,15 @@
// Zp // Zp
//////////////////////////////// ////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb); if ( locala ) {
if ( localc ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basec); ZM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,permc); MAYBEPERM(PERMUTE_DIR1,perma);
} else { } else {
LOAD_CHI(basec); LOAD_CHI(basea);
} }
{ {
MULT_2SPIN_DIR_PFZP(Zp,basea); MULT_2SPIN_DIR_PFZP(Zp,baseb);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM; ZM_RECON_ACCUM;
@ -82,17 +72,16 @@
//////////////////////////////// ////////////////////////////////
// Tp // Tp
//////////////////////////////// ////////////////////////////////
basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++; basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basec); if ( localb ) {
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(basea); TM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,perma); MAYBEPERM(PERMUTE_DIR0,permb);
} else { } else {
LOAD_CHI(basea); LOAD_CHI(baseb);
} }
{ {
MULT_2SPIN_DIR_PFTP(Tp,baseb); MULT_2SPIN_DIR_PFTP(Tp,basea);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM; TM_RECON_ACCUM;
@ -100,17 +89,16 @@
//////////////////////////////// ////////////////////////////////
// Xm // Xm
//////////////////////////////// ////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basea); if ( locala ) {
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(baseb); XP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,permb); MAYBEPERM(PERMUTE_DIR3,perma);
} else { } else {
LOAD_CHI(baseb); LOAD_CHI(basea);
} }
{ {
MULT_2SPIN_DIR_PFXM(Xm,basec); MULT_2SPIN_DIR_PFXM(Xm,baseb);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM; XP_RECON_ACCUM;
@ -118,14 +106,13 @@
//////////////////////////////// ////////////////////////////////
// Ym // Ym
//////////////////////////////// ////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++; basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb); if ( localb ) {
if ( localc ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(basec); YP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permc); MAYBEPERM(PERMUTE_DIR2,permb);
} else { } else {
LOAD_CHI(basec); LOAD_CHI(baseb);
} }
{ {
MULT_2SPIN_DIR_PFYM(Ym,basea); MULT_2SPIN_DIR_PFYM(Ym,basea);
@ -136,8 +123,7 @@
//////////////////////////////// ////////////////////////////////
// Zm // Zm
//////////////////////////////// ////////////////////////////////
basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++; baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
if ( locala ) { if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea); ZP_PROJMEM(basea);
@ -155,7 +141,6 @@
// Tm // Tm
//////////////////////////////// ////////////////////////////////
basea = (uint64_t)&out._odata[ss]; basea = (uint64_t)&out._odata[ss];
PREFETCH_CHIMU(basea);
if ( localb ) { if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb); TP_PROJMEM(baseb);
@ -163,16 +148,16 @@
} else { } else {
LOAD_CHI(baseb); LOAD_CHI(baseb);
} }
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal);
{ {
MULT_2SPIN_DIR_PFTM(Tm,basec); MULT_2SPIN_DIR_PFTM(Tm,basea);
} }
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM; TP_RECON_ACCUM;
// PREFETCH_CHIMU(basex); SAVE_RESULT(&out._odata[ss],baseb);
SAVE_RESULT(&out._odata[ss]);
}
}
ssU++; ssU++;
} }
} }

View File

@ -0,0 +1,187 @@
// Include-fragment: one brace-enclosed compound statement with no function header of its own.
// For every site in [0,Ns) and every s in [0,Ls) it forms ss = sU*Ls+s, runs the eight
// direction sections Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm in that order, and finishes with
// SAVE_RESULT(&out._odata[ss]).
//
// Names used here but declared outside this fragment: in, out, st, lo, signs, ss, ssU, Ns, Ls,
// the direction tags Xp..Tm, and all upper-case macros plus label()/FX().
//
// Three sets of stencil variables (a, b, c) are used in rotation: each section consumes the
// set filled by an earlier GetInfo call while a fresh GetInfo call refills another set.
// NOTE(review): the second argument of MULT_2SPIN_DIR_PF* is presumably a prefetch address
// (going by the PF in the macro names) -- confirm against the macro definitions.
// NOTE(review): the direction tag passed to st.GetInfo (Xp/Yp/Zp) does not follow the
// section order; the entry looked up appears to be selected by `ent` -- confirm.
{
// local*/perm* are tested below to choose the load path; ptype* are only passed to GetInfo
// and are not read anywhere else in this fragment.
int locala,perma, ptypea;
int localb,permb, ptypeb;
int localc,permc, ptypec;
uint64_t basea, baseb, basec;
// basex is assigned once per ss; its only consumer in this fragment is commented out.
uint64_t basex;
// Address of the first element of the input field, handed to every GetInfo call.
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
// isigns points at the start of the signs array; it is the operand of every LOAD64 below.
vComplexF *isigns = &signs[0];
MASK_REGS;
for(int site=0;site<Ns;site++) {
// ssU is advanced once per outer iteration (at the bottom of this loop) and mapped through lo.Reorder.
int sU=lo.Reorder(ssU);
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
////////////////////////////////
// Xp
////////////////////////////////
// Eight stencil entries per ss; ent is incremented after each GetInfo call.
int ent=ss*8;// 2*Ndim
// Fill all three variable sets up front (entries ent+0, ent+1, ent+2) and prefetch each base.
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
basex = basea;
// label(FX(..)) tags the start of each section; FX is defined by the including file.
label(FX(XP) );
// Set a: when locala is set, project from memory and optionally permute; otherwise load directly.
if ( locala ) {
LOAD64(%r10,isigns);
XM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXP(Xp,baseb);
}
LOAD64(%r10,isigns);
// First direction uses XM_RECON; every later direction uses a *_RECON_ACCUM macro.
XM_RECON;
////////////////////////////////
// Yp
////////////////////////////////
// Refill set a (entry ent+3) while set b is consumed by this section.
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
label(FX(YP) );
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basec);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
////////////////////////////////
// Zp
////////////////////////////////
// Refill set b (entry ent+4) while set c is consumed by this section.
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
label(FX(ZP) );
if ( localc ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basec);
MAYBEPERM(PERMUTE_DIR1,permc);
} else {
LOAD_CHI(basec);
}
{
MULT_2SPIN_DIR_PFZP(Zp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
////////////////////////////////
// Tp
////////////////////////////////
// Refill set c (entry ent+5) while set a is consumed by this section.
basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
label(FX(TP) );
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR0,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFTP(Tp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
////////////////////////////////
// Xm
////////////////////////////////
// Refill set a (entry ent+6) while set b is consumed by this section.
basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basea);
label(FX(XM) );
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR3,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFXM(Xm,basec);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
////////////////////////////////
// Ym
////////////////////////////////
// Refill set b (entry ent+7, the last of this ss's eight) while set c is consumed.
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
PREFETCH_CHIMU(baseb);
label(FX(YM) );
if ( localc ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(basec);
MAYBEPERM(PERMUTE_DIR2,permc);
} else {
LOAD_CHI(basec);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_RECON_ACCUM;
////////////////////////////////
// Zm
////////////////////////////////
// ent is now ss*8+8, i.e. (ss+1)*8: one past the eight entries of this ss.
// NOTE(review): confirm GetInfo accepts this index for the largest ss that is visited.
basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
PREFETCH_CHIMU(basec);
label(FX(ZM) );
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZM(Zm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_RECON_ACCUM;
////////////////////////////////
// Tm
////////////////////////////////
// No further GetInfo call: basea is pointed at the output element for this ss and prefetched.
basea = (uint64_t)&out._odata[ss];
PREFETCH_CHIMU(basea);
label(FX(TM) );
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTM(Tm,basec);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
// PREFETCH_CHIMU(basex);
label(FX(SAV) );
// Store the accumulated result for this ss.
SAVE_RESULT(&out._odata[ss]);
}
// Advance the outer-site counter that feeds lo.Reorder at the top of the loop.
ssU++;
}
}

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -37,6 +37,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
"mov $0x5555, %%eax \n"\ "mov $0x5555, %%eax \n"\
"kmovw %%eax, %%k7 \n" : : : "%eax"); "kmovw %%eax, %%k7 \n" : : : "%eax");
//#define label(B) __asm__ ( __func__ __LINE__ #B ":\n" );
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n" #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"