mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
KNL streaming stores, and KNL performance counters
This commit is contained in:
parent
2d4a45c758
commit
81f2aeaece
@ -251,11 +251,13 @@ int main (int argc, char ** argv)
|
|||||||
sr_o = zero;
|
sr_o = zero;
|
||||||
|
|
||||||
sDw.ZeroCounters();
|
sDw.ZeroCounters();
|
||||||
|
sDw.stat.init("DhopEO");
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
for (int i = 0; i < ncall; i++) {
|
for (int i = 0; i < ncall; i++) {
|
||||||
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
|
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
|
||||||
}
|
}
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
|
sDw.stat.print();
|
||||||
|
|
||||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
double flops=(1344.0*volume*ncall)/2;
|
double flops=(1344.0*volume*ncall)/2;
|
||||||
|
@ -51,7 +51,7 @@ int main (int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
const int Ls=16;
|
const int Ls=8;
|
||||||
int threads = GridThread::GetThreads();
|
int threads = GridThread::GetThreads();
|
||||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
|
14
configure.ac
14
configure.ac
@ -70,6 +70,20 @@ case ${ac_LAPACK} in
|
|||||||
AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
|
AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
################## first-touch ####################
# --enable-numa[=yes|no|prefix] : define GRID_NUMA for any value other than "no".
# AC_ARG_ENABLE([numa]) stores the user's value in the shell variable
# "enable_numa" (lower case, matching the feature name), so that is the
# variable that has to be read here.
AC_ARG_ENABLE([numa],
     [AS_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
     [ac_NUMA=${enable_numa}],[ac_NUMA=no])

case ${ac_NUMA} in
    no)
    ;;
    yes)
      AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
    *)
      AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
esac
|
||||||
|
|
||||||
################## FFTW3 ####################
|
################## FFTW3 ####################
|
||||||
AC_ARG_WITH([fftw],
|
AC_ARG_WITH([fftw],
|
||||||
[AS_HELP_STRING([--with-fftw=prefix],
|
[AS_HELP_STRING([--with-fftw=prefix],
|
||||||
|
@ -113,9 +113,8 @@ public:
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
_Tp tmp;
|
_Tp tmp;
|
||||||
#undef FIRST_TOUCH_OPTIMISE
|
#ifdef GRID_NUMA
|
||||||
#ifdef FIRST_TOUCH_OPTIMISE
|
#pragma omp parallel for schedule(static)
|
||||||
#pragma omp parallel for
|
|
||||||
for(int i=0;i<__n;i++){
|
for(int i=0;i<__n;i++){
|
||||||
ptr[i]=tmp;
|
ptr[i]=tmp;
|
||||||
}
|
}
|
||||||
|
233
lib/Stat.cc
Normal file
233
lib/Stat.cc
Normal file
@ -0,0 +1,233 @@
|
|||||||
|
#include <cstdio>
#include <cstdlib>

#include <Grid.h>
#include <PerfCount.h>
#include <Stat.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
|
||||||
|
bool PmuStat::pmu_initialized=false;
|
||||||
|
|
||||||
|
|
||||||
|
void PmuStat::init(const char *regname)
|
||||||
|
{
|
||||||
|
name = regname;
|
||||||
|
if (!pmu_initialized)
|
||||||
|
{
|
||||||
|
std::cout<<"initialising pmu"<<std::endl;
|
||||||
|
pmu_initialized = true;
|
||||||
|
pmu_init();
|
||||||
|
}
|
||||||
|
clear();
|
||||||
|
}
|
||||||
|
void PmuStat::clear(void)
|
||||||
|
{
|
||||||
|
count = 0;
|
||||||
|
tregion = 0;
|
||||||
|
pmc0 = 0;
|
||||||
|
pmc1 = 0;
|
||||||
|
inst = 0;
|
||||||
|
cyc = 0;
|
||||||
|
ref = 0;
|
||||||
|
tcycles = 0;
|
||||||
|
reads = 0;
|
||||||
|
writes = 0;
|
||||||
|
}
|
||||||
|
void PmuStat::print(void)
|
||||||
|
{
|
||||||
|
std::cout <<"Reg "<<std::string(name)<<":\n";
|
||||||
|
std::cout <<" region "<<tregion<<std::endl;
|
||||||
|
std::cout <<" cycles "<<tcycles<<std::endl;
|
||||||
|
std::cout <<" inst "<<inst <<std::endl;
|
||||||
|
std::cout <<" cyc "<<cyc <<std::endl;
|
||||||
|
std::cout <<" ref "<<ref <<std::endl;
|
||||||
|
std::cout <<" pmc0 "<<pmc0 <<std::endl;
|
||||||
|
std::cout <<" pmc1 "<<pmc1 <<std::endl;
|
||||||
|
std::cout <<" count "<<count <<std::endl;
|
||||||
|
std::cout <<" reads "<<reads <<std::endl;
|
||||||
|
std::cout <<" writes "<<writes <<std::endl;
|
||||||
|
}
|
||||||
|
void PmuStat::start(void)
|
||||||
|
{
|
||||||
|
pmu_start();
|
||||||
|
++count;
|
||||||
|
xmemctrs(&mrstart, &mwstart);
|
||||||
|
tstart = _rdtsc();
|
||||||
|
}
|
||||||
|
void PmuStat::enter(int t)
|
||||||
|
{
|
||||||
|
counters[0][t] = _rdpmc(0);
|
||||||
|
counters[1][t] = _rdpmc(1);
|
||||||
|
counters[2][t] = _rdpmc((1<<30)|0);
|
||||||
|
counters[3][t] = _rdpmc((1<<30)|1);
|
||||||
|
counters[4][t] = _rdpmc((1<<30)|2);
|
||||||
|
counters[5][t] = _rdtsc();
|
||||||
|
}
|
||||||
|
void PmuStat::exit(int t)
|
||||||
|
{
|
||||||
|
counters[0][t] = _rdpmc(0) - counters[0][t];
|
||||||
|
counters[1][t] = _rdpmc(1) - counters[1][t];
|
||||||
|
counters[2][t] = _rdpmc((1<<30)|0) - counters[2][t];
|
||||||
|
counters[3][t] = _rdpmc((1<<30)|1) - counters[3][t];
|
||||||
|
counters[4][t] = _rdpmc((1<<30)|2) - counters[4][t];
|
||||||
|
counters[5][t] = _rdtsc() - counters[5][t];
|
||||||
|
}
|
||||||
|
void PmuStat::accum(int nthreads)
|
||||||
|
{
|
||||||
|
tend = _rdtsc();
|
||||||
|
xmemctrs(&mrend, &mwend);
|
||||||
|
pmu_stop();
|
||||||
|
for (int t = 0; t < nthreads; ++t) {
|
||||||
|
pmc0 += counters[0][t];
|
||||||
|
pmc1 += counters[1][t];
|
||||||
|
inst += counters[2][t];
|
||||||
|
cyc += counters[3][t];
|
||||||
|
ref += counters[4][t];
|
||||||
|
tcycles += counters[5][t];
|
||||||
|
}
|
||||||
|
uint64_t region = tend - tstart;
|
||||||
|
tregion += region;
|
||||||
|
uint64_t mreads = mrend - mrstart;
|
||||||
|
reads += mreads;
|
||||||
|
uint64_t mwrites = mwend - mwstart;
|
||||||
|
writes += mwrites;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Intentionally empty hooks: nothing is done at teardown or around a
// measured region.
void PmuStat::pmu_fini(void)
{
}

void PmuStat::pmu_start(void)
{
}

void PmuStat::pmu_stop(void)
{
}
|
||||||
|
// One-time PMU setup. Only does work when _KNIGHTS_LANDING_ is defined,
// in which case the KNL counters are opened via KNLsetup().
void PmuStat::pmu_init(void)
{
#if defined(_KNIGHTS_LANDING_)
  KNLsetup();
#endif
}
|
||||||
|
// Report the current memory read/write counter totals.
//   mr : receives the sum of the EDC read counters
//   mw : receives the sum of the EDC write counters
// Without _KNIGHTS_LANDING_ both outputs are set to zero.
void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
{
#ifdef _KNIGHTS_LANDING_
  ctrs snapshot;
  KNLreadctrs(snapshot);

  // Only the EDC read/write arrays are summed here; the other fields of the
  // snapshot are not used.
  uint64_t total_rd = 0;
  uint64_t total_wr = 0;
  for (int edc = 0; edc != NEDC; edc++) {
    total_rd += snapshot.edcrd[edc];
    total_wr += snapshot.edcwr[edc];
  }

  *mr = total_rd;
  *mw = total_wr;
#else
  *mw = 0;
  *mr = 0;
#endif
}
|
||||||
|
|
||||||
|
#ifdef _KNIGHTS_LANDING_
|
||||||
|
|
||||||
|
struct knl_gbl_ PmuStat::gbl;
|
||||||
|
|
||||||
|
#define PMU_MEM
|
||||||
|
|
||||||
|
void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
|
||||||
|
{
|
||||||
|
char fname[1024];
|
||||||
|
snprintf(fname, sizeof(fname), "%s/type", ename);
|
||||||
|
FILE *fp = fopen(fname, "r");
|
||||||
|
if (fp == 0) {
|
||||||
|
::printf("open %s", fname);
|
||||||
|
::exit(0);
|
||||||
|
}
|
||||||
|
int type;
|
||||||
|
int ret = fscanf(fp, "%d", &type);
|
||||||
|
assert(ret == 1);
|
||||||
|
fclose(fp);
|
||||||
|
// std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;
|
||||||
|
|
||||||
|
struct perf_event_attr hw = {};
|
||||||
|
hw.size = sizeof(hw);
|
||||||
|
hw.type = type;
|
||||||
|
// see /sys/devices/uncore_*/format/*
|
||||||
|
// All of the events we are interested in are configured the same way, but
|
||||||
|
// that isn't always true. Proper code would parse the format files
|
||||||
|
hw.config = event | (umask << 8);
|
||||||
|
//hw.read_format = PERF_FORMAT_GROUP;
|
||||||
|
// unfortunately the above only works within a single PMU; might
|
||||||
|
// as well just read them one at a time
|
||||||
|
int cpu = 0;
|
||||||
|
fd = perf_event_open(&hw, -1, cpu, -1, 0);
|
||||||
|
if (fd == -1) {
|
||||||
|
::printf("CPU %d, box %s, event 0x%lx", cpu, ename, hw.config);
|
||||||
|
::exit(0);
|
||||||
|
} else {
|
||||||
|
// std::cout << "event "<<std::string(ename)<<" set up for fd "<<fd<<" hw.config "<<hw.config <<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void PmuStat::KNLsetup(void){
|
||||||
|
|
||||||
|
int ret;
|
||||||
|
char fname[1024];
|
||||||
|
|
||||||
|
// MC RPQ inserts and WPQ inserts (reads & writes)
|
||||||
|
for (int mc = 0; mc < NMC; ++mc)
|
||||||
|
{
|
||||||
|
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
|
||||||
|
// RPQ Inserts
|
||||||
|
KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
|
||||||
|
// WPQ Inserts
|
||||||
|
KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
|
||||||
|
}
|
||||||
|
// EDC RPQ inserts and WPQ inserts
|
||||||
|
for (int edc=0; edc < NEDC; ++edc)
|
||||||
|
{
|
||||||
|
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
|
||||||
|
// RPQ inserts
|
||||||
|
KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
|
||||||
|
// WPQ inserts
|
||||||
|
KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
|
||||||
|
}
|
||||||
|
// EDC HitE, HitM, MissE, MissM
|
||||||
|
for (int edc=0; edc < NEDC; ++edc)
|
||||||
|
{
|
||||||
|
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
|
||||||
|
KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
|
||||||
|
KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
|
||||||
|
KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
|
||||||
|
KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read one 64-bit counter value from an event descriptor.
//   fd : descriptor previously opened by KNLevsetup()
// Returns the value read. A failed or short read is reported on stderr and
// terminates the process with a non-zero status.
uint64_t PmuStat::KNLreadctr(int fd)
{
  uint64_t data = 0;
  // read() returns a signed count (-1 on error), so keep it signed
  const ssize_t nbytes = ::read(fd, &data, sizeof(data));
  if (nbytes != static_cast<ssize_t>(sizeof(data))) {
    if (nbytes < 0) {
      ::perror("read counter");
    }
    ::fprintf(stderr, "read counter %zd\n", nbytes);
    ::exit(EXIT_FAILURE);   // ::exit, since PmuStat::exit(int) hides the C function here
  }
  return data;
}
|
||||||
|
|
||||||
|
// Fill `c` with the current value of every counter opened by KNLsetup().
// Counters are read in three passes: MC read/write, EDC read/write, then the
// EDC hit/miss events.
void PmuStat::KNLreadctrs(ctrs &c)
{
  for (int mc = 0; mc != NMC; mc++) {
    c.mcrd[mc] = KNLreadctr(gbl.mc_rd[mc]);
    c.mcwr[mc] = KNLreadctr(gbl.mc_wr[mc]);
  }

  for (int edc = 0; edc != NEDC; edc++) {
    c.edcrd[edc] = KNLreadctr(gbl.edc_rd[edc]);
    c.edcwr[edc] = KNLreadctr(gbl.edc_wr[edc]);
  }

  for (int edc = 0; edc != NEDC; edc++) {
    c.edchite[edc]  = KNLreadctr(gbl.edc_hite[edc]);
    c.edchitm[edc]  = KNLreadctr(gbl.edc_hitm[edc]);
    c.edcmisse[edc] = KNLreadctr(gbl.edc_misse[edc]);
    c.edcmissm[edc] = KNLreadctr(gbl.edc_missm[edc]);
  }
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
}
|
100
lib/Stat.h
Normal file
100
lib/Stat.h
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
#ifndef _GRID_STAT_H
#define _GRID_STAT_H

#include <cstdint>

#ifdef AVX512
#define _KNIGHTS_LANDING_
#endif

// The namespace is opened outside the conditional section so that the closing
// brace at the end of this header is matched whether or not
// _KNIGHTS_LANDING_ is defined.
namespace Grid {

#ifdef _KNIGHTS_LANDING_

#define NMC  6
#define NEDC 8

// One snapshot of every counter value read by PmuStat::KNLreadctrs.
struct ctrs
{
  uint64_t mcrd[NMC];
  uint64_t mcwr[NMC];
  uint64_t edcrd[NEDC];
  uint64_t edcwr[NEDC];
  uint64_t edchite[NEDC];
  uint64_t edchitm[NEDC];
  uint64_t edcmisse[NEDC];
  uint64_t edcmissm[NEDC];
};

// Peter/Azusa:
// Our modification of a code provided by Larry Meadows from Intel
// Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
// so is already public and in the linux kernel for KNL.
//
// File descriptors of the opened counter events, one per counter in `ctrs`.
struct knl_gbl_
{
  int mc_rd[NMC];
  int mc_wr[NMC];
  int edc_rd[NEDC];
  int edc_wr[NEDC];
  int edc_hite[NEDC];
  int edc_hitm[NEDC];
  int edc_misse[NEDC];
  int edc_missm[NEDC];
};
#endif

// Accumulates performance-counter statistics for one named code region.
// Usage sequence: init(name) once; then per measurement start(), enter(t) and
// exit(t) from each thread t, and accum(nthreads); print() reports the totals.
class PmuStat
{
  const char *name;   // region label given to init(); pointer is stored, not copied

  // Per-thread scratch values, indexed [row][thread]. Each row holds the
  // start value after enter() and the elapsed delta after exit():
  //   0 : _rdpmc(0)          -> summed into pmc0
  //   1 : _rdpmc(1)          -> summed into pmc1
  //   2 : _rdpmc((1<<30)|0)  -> summed into inst
  //   3 : _rdpmc((1<<30)|1)  -> summed into cyc
  //   4 : _rdpmc((1<<30)|2)  -> summed into ref
  //   5 : _rdtsc()           -> summed into tcycles
  //   6,7 : not used
  alignas(64) uint64_t counters[8][256];
#ifdef _KNIGHTS_LANDING_
  static struct knl_gbl_ gbl;
#endif

  uint64_t reads;     // memory reads
  uint64_t writes;    // memory writes
  uint64_t mrstart;   // memory read counter at start of parallel region
  uint64_t mrend;     // memory read counter at end of parallel region
  uint64_t mwstart;   // memory write counter at start of parallel region
  uint64_t mwend;     // memory write counter at end of parallel region

  // cumulative counters
  uint64_t count;           // number of invocations
  uint64_t tregion;         // sum over invocations of (tend - tstart)
  uint64_t tcycles;         // sum of the per-thread tsc deltas
  uint64_t inst, ref, cyc;  // fixed counters
  uint64_t pmc0, pmc1;      // pmu
  // add memory counters here

  // temp variables
  uint64_t tstart;   // tsc at start of parallel region
  uint64_t tend;     // tsc at end of parallel region

  static bool pmu_initialized;
public:
  static bool is_init(void){ return pmu_initialized;}
  static void pmu_init(void);
  static void pmu_fini(void);
  static void pmu_start(void);
  static void pmu_stop(void);
  void accum(int nthreads);
  static void xmemctrs(uint64_t *mr, uint64_t *mw);
  void start(void);
  void enter(int t);
  void exit(int t);
  void print(void);
  void init(const char *regname);
  void clear(void);
#ifdef _KNIGHTS_LANDING_
  static void KNLsetup(void);
  static uint64_t KNLreadctr(int fd);
  static void KNLreadctrs(ctrs &c);
  static void KNLevsetup(const char *ename, int &fd, int event, int umask);
#endif

};

}

#endif
|
||||||
|
|
@ -37,7 +37,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ")
|
#ifdef GRID_NUMA
|
||||||
|
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
|
||||||
|
#else
|
||||||
|
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
|
||||||
|
#endif
|
||||||
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
|
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
|
||||||
#else
|
#else
|
||||||
#define PARALLEL_FOR_LOOP
|
#define PARALLEL_FOR_LOOP
|
||||||
|
@ -178,7 +178,7 @@ public:
|
|||||||
// all elements of a simd vector must have same checkerboard.
|
// all elements of a simd vector must have same checkerboard.
|
||||||
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
|
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
|
||||||
if ( _simd_layout[d]>1 ) {
|
if ( _simd_layout[d]>1 ) {
|
||||||
if ( d != _checker_dim ) {
|
if ( checker_dim_mask[d] ) {
|
||||||
assert( (_rdimensions[d]&0x1) == 0 );
|
assert( (_rdimensions[d]&0x1) == 0 );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -416,6 +416,28 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
|||||||
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
|
Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in,
|
||||||
out);
|
out);
|
||||||
}
|
}
|
||||||
|
#ifdef AVX512
|
||||||
|
} else if (stat.is_init() ) {
|
||||||
|
|
||||||
|
int nthreads;
|
||||||
|
stat.start();
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
#pragma omp master
|
||||||
|
nthreads = omp_get_num_threads();
|
||||||
|
int mythread = omp_get_thread_num();
|
||||||
|
stat.enter(mythread);
|
||||||
|
#pragma omp for nowait
|
||||||
|
for(int ss=0;ss<U._grid->oSites();ss++)
|
||||||
|
{
|
||||||
|
int sU=ss;
|
||||||
|
int sF=LLs*sU;
|
||||||
|
Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
|
||||||
|
}
|
||||||
|
stat.exit(mythread);
|
||||||
|
}
|
||||||
|
stat.accum(nthreads);
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
|
@ -31,6 +31,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_QCD_WILSON_FERMION_5D_H
|
#ifndef GRID_QCD_WILSON_FERMION_5D_H
|
||||||
#define GRID_QCD_WILSON_FERMION_5D_H
|
#define GRID_QCD_WILSON_FERMION_5D_H
|
||||||
|
|
||||||
|
#include <Grid/Stat.h>
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
@ -60,6 +62,7 @@ namespace Grid {
|
|||||||
public:
|
public:
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef WilsonKernels<Impl> Kernels;
|
typedef WilsonKernels<Impl> Kernels;
|
||||||
|
PmuStat stat;
|
||||||
|
|
||||||
void Report(void);
|
void Report(void);
|
||||||
void ZeroCounters(void);
|
void ZeroCounters(void);
|
||||||
|
@ -134,7 +134,9 @@
|
|||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Xm
|
// Xm
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
|
#ifndef STREAM_STORE
|
||||||
basep= (uint64_t) &out._odata[ss];
|
basep= (uint64_t) &out._odata[ss];
|
||||||
|
#endif
|
||||||
// basep= st.GetPFInfo(nent,plocal); nent++;
|
// basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
if ( local ) {
|
if ( local ) {
|
||||||
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
@ -229,7 +231,9 @@
|
|||||||
LOAD_CHI(base);
|
LOAD_CHI(base);
|
||||||
}
|
}
|
||||||
base= (uint64_t) &out._odata[ss];
|
base= (uint64_t) &out._odata[ss];
|
||||||
|
#ifndef STREAM_STORE
|
||||||
PREFETCH_CHIMU(base);
|
PREFETCH_CHIMU(base);
|
||||||
|
#endif
|
||||||
{
|
{
|
||||||
MULT_2SPIN_DIR_PFTM(Tm,basep);
|
MULT_2SPIN_DIR_PFTM(Tm,basep);
|
||||||
}
|
}
|
||||||
|
@ -138,9 +138,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||||
|
|
||||||
|
#define STREAM_STORE
|
||||||
|
#ifdef STREAM_STORE
|
||||||
|
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||||
|
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||||
|
#else
|
||||||
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||||
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||||
|
#endif
|
||||||
|
|
||||||
// Swaps Re/Im ; could unify this with IMCI
|
// Swaps Re/Im ; could unify this with IMCI
|
||||||
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
|
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
|
||||||
|
Loading…
Reference in New Issue
Block a user