mirror of https://github.com/paboyle/Grid.git synced 2024-11-09 23:45:36 +00:00

Large change with KNL preparation

paboyle 2016-06-03 03:24:26 -07:00
parent 1c0e922585
commit 139cc5f1ae
26 changed files with 1810 additions and 1705 deletions

View File

@@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>
using namespace std;
using namespace Grid;
@@ -45,6 +46,10 @@ struct scal {
};
bool overlapComms = false;
typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
int main (int argc, char ** argv)
{
@@ -58,12 +63,17 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=8;
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
@@ -78,7 +88,9 @@ int main (int argc, char ** argv)
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
@@ -119,11 +131,16 @@ int main (int argc, char ** argv)
RealD NP = UGrid->_Nprocessors;
for(int doasm=1;doasm<2;doasm++){
QCD::WilsonKernelsStatic::AsmOpt=doasm;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=100;
{
int ncall =50;
if (0) {
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
@@ -140,10 +157,83 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
// Dw.Report();
}
exit(0);
if (1)
{
typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
LatticeFermionF ssrc(sFGrid);
LatticeFermionF sref(sFGrid);
LatticeFermionF sresult(sFGrid);
WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5,params);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
double t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.Dhop(ssrc,sresult,0);
__SSC_STOP;
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
// sDw.Report();
if(1){
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
}
}
if(0){
std::cout<<GridLogMessage << " Cycle reporting "<<std::endl;
QCD::WilsonFermion5DStatic::CycleReport=1;
for(int i=0;i<ncall;i++){
sDw.Dhop(ssrc,sresult,0);
}
QCD::WilsonFermion5DStatic::CycleReport=0;
}
RealF sum=0;
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVectorF normal, simd;
peekSite(normal,result,site);
peekSite(simd,sresult,site);
sum=sum+norm2(normal-simd);
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
// std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
}}}}}
std::cout<<" difference between normal and simd is "<<sum<<std::endl;
}
if (1)
{ // Naive wilson dag implementation
@@ -217,5 +307,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
}
Grid_finalize();
}
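The timing pattern above recurs throughout this benchmark: bracket ncall kernel invocations with usecond(), then convert to Mflop/s via the 1344 flops/site cost used for Dhop. A minimal standalone sketch of that arithmetic (the lattice extents and the commented-out kernel call are placeholders, not Grid code):

#include <sys/time.h>
#include <cstdio>

// usecond() as used above: wall-clock microseconds.
static double usecond(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return 1.0 * tv.tv_usec + 1.0e6 * tv.tv_sec;
}

int main(void) {
  const int Ls = 16, ncall = 100, Nd = 4;
  const int latt4[4] = {16, 16, 16, 16};           // placeholder local lattice
  double volume = Ls;
  for (int mu = 0; mu < Nd; mu++) volume *= latt4[mu];

  double t0 = usecond();
  for (int i = 0; i < ncall; i++) { /* Dw.Dhop(src,result,0); */ }
  double t1 = usecond();

  double flops = 1344.0 * volume * ncall;           // Dhop flop count per site
  std::printf("mflop/s = %f\n", flops / (t1 - t0)); // flops per us == Mflop/s
  return 0;
}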

View File

@@ -119,7 +119,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
mfc = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s = "<< mfc<<std::endl;
QCD::WilsonFermion5DStatic::AsmOptDslash=1;
QCD::WilsonKernelsStatic::AsmOpt=1;
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulta,0);

View File

@@ -55,6 +55,15 @@ echo :::::::::::::::::::::::::::::::::::::::::::
AC_CHECK_FUNCS([gettimeofday])
#AC_CHECK_LIB([gmp],[__gmpf_init],,
# [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.gmplib.org)])
#AC_CHECK_LIB([mpfr],[mpfr_init],,
# [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
#Please install or provide the correct path to your installation
#Info at: http://www.mpfr.org/)])
#
# SIMD instructions selection
@@ -199,6 +208,25 @@ case ${ac_RNG} in
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
;;
esac
#
# SDE timing mode
#
AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|mp],\
[Enable system dependent high res timers])],\
[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
case ${ac_TIMERS} in
yes)
AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
;;
no)
AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
;;
*)
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
;;
esac
#
# Chroma regression tests
#

View File

@@ -211,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
Grid_quiesce_nodes();
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
QCD::WilsonFermionStatic::HandOptDslash=1;
QCD::WilsonFermion5DStatic::HandOptDslash=1;
QCD::WilsonKernelsStatic::HandOpt=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;
@@ -276,11 +275,6 @@ void Grid_finalize(void)
Grid_unquiesce_nodes();
#endif
}
double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
void * Grid_backtrace_buffer[_NBACKTRACE];
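The flag handling above relies on GridCmdOptionExists; presumably it is the usual argv scan, along these lines (standalone sketch, helper name hypothetical):

#include <algorithm>
#include <string>

// Hypothetical standalone equivalent of the GridCmdOptionExists test above:
// scan argv for an exact flag match.
static bool cmdOptionExists(char **begin, char **end, const std::string &option) {
  return std::find(begin, end, option) != end;
}

// usage inside main(): if (cmdOptionExists(argv, argv + argc, "--dslash-opt")) ...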

View File

@@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
#ifdef __linux__
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." },
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS...."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS....."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS..."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS.."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS"},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS......."},
// { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS....."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS...."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS..."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS...."}
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef AVX512
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
// 11
#else
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
// 11
#endif
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
//15
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
//19
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif
};
}
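The CacheControl macro above packs the config word exactly as perf_event_open(2) specifies for PERF_TYPE_HW_CACHE events: cache id in bits 0-7, operation in bits 8-15, result in bits 16-23. A quick standalone check of one entry (Linux only):

#include <linux/perf_event.h>
#include <cstdio>

int main(void) {
  // CacheControl(L1D,READ,MISS) expands to:
  unsigned long long config =
      PERF_COUNT_HW_CACHE_L1D |
      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
  std::printf("L1D_READ_MISS config = 0x%llx\n", config); // 0x10000
  return 0;
}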

View File

@@ -58,6 +58,28 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
}
#endif
#ifdef TIMERS_OFF
#warning PerfCount: Disabling cycle timers
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/
#ifdef __bgq__
inline uint64_t cyclecount(void){
uint64_t tmp;
@@ -65,18 +87,21 @@ inline uint64_t cyclecount(void){
return tmp;
}
#elif defined __x86_64__
#include <immintrin.h>
#ifndef __INTEL_COMPILER
#include <x86intrin.h>
#endif
inline uint64_t cyclecount(void){
return __rdtsc();
inline uint64_t cyclecount(void){
return __rdtsc();
// unsigned int dummy;
// return __rdtscp(&dummy);
}
#else
#warning No cycle counter implemented for this architecture
inline uint64_t cyclecount(void){
return 0;
}
#endif
#endif
class PerformanceCounter {
@@ -87,6 +112,7 @@ private:
uint32_t type;
uint64_t config;
const char *name;
int normalisation;
} PerformanceCounterConfig;
static const PerformanceCounterConfig PerformanceCounterConfigs [];
@@ -94,26 +120,12 @@ private:
public:
enum PerformanceCounterType {
CPUCYCLES=0,
INSTRUCTIONS,
// STALL_CYCLES,
CACHE_REFERENCES,
CACHE_MISSES,
L1D_READ_MISS,
L1D_READ_ACCESS,
L1D_WRITE_MISS,
L1D_WRITE_ACCESS,
L1D_PREFETCH_MISS,
L1D_PREFETCH_ACCESS,
LL_READ_MISS,
// LL_READ_ACCESS,
LL_WRITE_MISS,
LL_WRITE_ACCESS,
LL_PREFETCH_MISS,
LL_PREFETCH_ACCESS,
L1I_READ_MISS,
L1I_READ_ACCESS,
PERFORMANCE_COUNTER_NUM_TYPES
CACHE_REFERENCES=0,
CACHE_MISSES=1,
CPUCYCLES=2,
INSTRUCTIONS=3,
L1D_READ_ACCESS=4,
PERFORMANCE_COUNTER_NUM_TYPES=19
};
public:
@@ -121,7 +133,9 @@ public:
int PCT;
long long count;
long long cycles;
int fd;
int cyclefd;
unsigned long long elapsed;
uint64_t begin;
@@ -134,7 +148,9 @@ public:
assert(_pct>=0);
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
fd=-1;
cyclefd=-1;
count=0;
cycles=0;
PCT =_pct;
Open();
#endif
@@ -159,6 +175,15 @@ public:
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is");
}
int norm = PerformanceCounterConfigs[PCT].normalisation;
pe.type = PerformanceCounterConfigs[norm].type;
pe.config= PerformanceCounterConfigs[norm].config;
name = PerformanceCounterConfigs[norm].name;
cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
if (cyclefd == -1) {
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is");
}
#endif
}
@@ -168,6 +193,8 @@ public:
if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
}
begin =cyclecount();
#else
@@ -177,10 +204,13 @@ public:
void Stop(void) {
count=0;
cycles=0;
#ifdef __linux__
if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
::read(fd, &count, sizeof(long long));
::read(cyclefd, &cycles, sizeof(long long));
}
elapsed = cyclecount() - begin;
#else
@@ -190,7 +220,11 @@ public:
}
void Report(void) {
#ifdef __linux__
std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
int N = PerformanceCounterConfigs[PCT].normalisation;
const char * sn = PerformanceCounterConfigs[N].name ;
const char * sc = PerformanceCounterConfigs[PCT].name;
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
sc, count, sc,sn, (double)count/(double)cycles);
#else
std::printf("%llu cycles \n", elapsed );
#endif
@@ -199,7 +233,7 @@ public:
~PerformanceCounter()
{
#ifdef __linux__
::close(fd);
::close(fd); ::close(cyclefd);
#endif
}
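Stripped of the Grid scaffolding, the fd/cyclefd lifecycle above is the standard perf_event sequence: open, reset and enable via ioctl, run the region of interest, disable, read. A minimal single-counter sketch (Linux only, error handling mostly elided):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <cstring>
#include <cstdio>

int main(void) {
  struct perf_event_attr pe;
  memset(&pe, 0, sizeof(pe));
  pe.type = PERF_TYPE_HARDWARE;
  pe.size = sizeof(pe);
  pe.config = PERF_COUNT_HW_INSTRUCTIONS;
  pe.disabled = 1;
  pe.exclude_kernel = 1;

  // pid 0, cpu -1: this process, any cpu; no group, no flags (as above).
  int fd = (int)syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
  if (fd == -1) { perror("perf_event_open"); return 1; }

  ioctl(fd, PERF_EVENT_IOC_RESET, 0);
  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
  volatile double x = 0;
  for (int i = 0; i < 1000000; i++) x += i;        // region of interest
  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

  long long count = 0;
  read(fd, &count, sizeof(count));
  std::printf("instructions = %lld\n", count);
  close(fd);
  return 0;
}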

File diff suppressed because it is too large

View File

@@ -35,11 +35,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid {
#ifdef TIMERS_OFF
#warning Timer.h Disabling timers
#endif
// Dress the output; use std::chrono
// C++11 time facilities better?
double usecond(void);
inline double usecond(void) {
struct timeval tv;
#ifdef TIMERS_ON
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
@@ -63,17 +72,23 @@ public:
}
void Start(void) {
assert(running == false);
#ifdef TIMERS_ON
start = GridClock::now();
#endif
running = true;
}
void Stop(void) {
assert(running == true);
#ifdef TIMERS_ON
accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
#endif
running = false;
};
void Reset(void){
running = false;
#ifdef TIMERS_ON
start = GridClock::now();
#endif
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
}
GridTime Elapsed(void) {
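GridStopWatch above accumulates a chrono duration across Start/Stop pairs, and with TIMERS_OFF the clock reads compile away so the accumulator stays at zero. A minimal standalone version of the same accumulate-on-Stop pattern (illustrative, not Grid's actual class):

#include <chrono>
#include <cstdio>

struct StopWatch {                                 // cf. GridStopWatch above
  using Clock = std::chrono::system_clock;         // GridClock
  Clock::time_point start;
  std::chrono::microseconds accum{0};
  void Start(void) { start = Clock::now(); }
  void Stop(void) {
    accum += std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - start);
  }
};

int main(void) {
  StopWatch sw;
  sw.Start();
  volatile double x = 0;
  for (int i = 0; i < 1000000; i++) x += i;        // timed work
  sw.Stop();
  std::printf("elapsed = %lld us\n", (long long)sw.accum.count());
  return 0;
}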

View File

@@ -172,6 +172,9 @@ public:
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
// all elements of a simd vector must have same checkerboard.
//
// If Ls is vectorised, this must still be the case; e.g.
// simd_layout == 8 requires _rdimensions[d] >= 2 (and even).
if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0);
_osites *= _rdimensions[d];
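To make the new assertion concrete: when a direction is SIMD-split, lane L of a vector sits at coordinate inner + L*_rdimensions[d], so all lanes share a checkerboard only if _rdimensions[d] is even. Worked numbers (hypothetical extents):

// _ldimensions[d]=16, _simd_layout[d]=8  ->  _rdimensions[d]=2 (even): lane
//   offsets 0,2,4,... contribute the same parity along d. OK.
// _ldimensions[d]=8,  _simd_layout[d]=8  ->  _rdimensions[d]=1 (odd): lane
//   offsets 0,1,2,... alternate parity, mixing checkerboards. Assert fires.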

View File

@@ -288,40 +288,20 @@ PARALLEL_FOR_LOOP
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopInternalCommsThenCompute(st,U,in,out,dag);
}
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
}
};

View File

@@ -114,9 +114,6 @@ namespace Grid {
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
// Constructor
WilsonFermion(GaugeField &_Umu,
GridCartesian &Fgrid,

View File

@@ -1,5 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -39,8 +38,6 @@ namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;
// 5d lattice for DWF.
template<class Impl>
@@ -300,6 +297,8 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
return;
#if 0
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
@@ -320,6 +319,7 @@ void WilsonFermion5D<Impl>::Report(void)
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
#endif
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
@@ -342,25 +342,14 @@ template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
// assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int LLs = in._grid->_rdimensions[0];
commtime -=usecond();
// auto handle = st.HaloExchangeBegin(in,compressor);
// st.HaloExchangeComplete(handle);
st.HaloExchange(in,compressor);
commtime +=usecond();
@@ -368,59 +357,24 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
jointime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Note loop ordering and data layout.
// Designed to create
// - per-thread reuse in L1 cache for U
// - 8 linear, unit-stride access streams per thread for the Fermion, which the hardware can prefetch.
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF=s+LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
if( this->AsmOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
}
} else {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
}
#pragma omp parallel for schedule(runtime)
for(int ss=0;ss<U._grid->oSites();ss++){
int sU=lo.Reorder(ss);
int sF=LLs*sU;
for(int s=0;s<LLs;s++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
sF++;
}
}
}
@@ -473,7 +427,7 @@ FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
}}
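The rewritten compute loop above makes the fifth dimension the innermost, unit-stride index: for each 4d site sU (optionally Lebesgue-reordered) the 5d fermion index runs sF = LLs*sU, LLs*sU+1, ... A scalar sketch of the index walk (sizes are placeholders):

#include <cstdio>

int main(void) {
  const int LLs = 4, oSites4d = 3;   // placeholder sizes
  for (int ss = 0; ss < oSites4d; ss++) {
    int sU = ss;                     // or lo.Reorder(ss), as above
    int sF = LLs * sU;               // fifth dimension is innermost
    for (int s = 0; s < LLs; s++) {
      std::printf("sU=%d s=%d -> sF=%d\n", sU, s, sF);
      sF++;                          // unit-stride walk through the 5d field
    }
  }
  return 0;
}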

View File

@@ -49,8 +49,6 @@ namespace Grid {
class WilsonFermion5DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOptDslash; // these are a temporary hack
static int HandOptDslash; // these are a temporary hack
static const std::vector<int> directions;
static const std::vector<int> displacements;
const int npoint = 8;
@@ -122,13 +120,6 @@ namespace Grid {
FermionField &out,
int dag);
void DhopInternalCommsThenCompute(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@@ -31,14 +31,43 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
namespace QCD {
int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;
template<class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,in,out);
else if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
// No asm implementation yet.
// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
// else
if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
}
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
@@ -214,9 +243,9 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
@@ -518,17 +547,9 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
vstream(out._odata[sF],result);
}
#if ( ! defined(AVX512) )
template<class Impl>
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
#endif
FermOpTemplateInstantiate(WilsonKernels);
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
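This is the heart of the refactor: the old per-call HandOptDslash/AsmOptDslash branching in the fermion operators collapses into DiracOptDhopSite, steered by the two WilsonKernelsStatic flags (the benchmark toggles AsmOpt directly; --dslash-opt sets HandOpt, see Init.cc above; the dagger variant has no asm path yet). A self-contained sketch of the same static-flag dispatch:

#include <cstdio>

// Standalone sketch of the static-flag kernel dispatch introduced above.
struct KernelsStatic {
  static int AsmOpt;                 // prefer the assembler kernel
  static int HandOpt;                // else prefer the hand-unrolled kernel
};
int KernelsStatic::AsmOpt = 0;
int KernelsStatic::HandOpt = 0;

static void DhopSiteAsm(void)     { std::printf("asm kernel\n"); }
static void DhopSiteHand(void)    { std::printf("hand-unrolled kernel\n"); }
static void DhopSiteGeneric(void) { std::printf("generic kernel\n"); }

static void DhopSite(void) {         // mirrors DiracOptDhopSite's selection
  if      (KernelsStatic::AsmOpt)  DhopSiteAsm();
  else if (KernelsStatic::HandOpt) DhopSiteHand();
  else                             DhopSiteGeneric();
}

int main(void) {
  KernelsStatic::AsmOpt = 1;         // what the benchmark loop toggles
  DhopSite();                        // -> asm kernel
  return 0;
}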

View File

@ -38,14 +38,21 @@ namespace Grid {
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOpt; // these are a temporary hack
static int HandOpt; // these are a temporary hack
};
template<class Impl> class WilsonKernels : public FermionOperator<Impl> {
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
@@ -58,15 +65,26 @@ namespace Grid {
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
private:
// Specialised variants
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);

View File

@@ -2,6 +2,8 @@
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
Copyright (C) 2015
@@ -26,237 +28,75 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#if defined(AVX512)
//#if defined (IMCI)
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
namespace Grid {
namespace QCD {
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
uint64_t now;
uint64_t first ;
int offset,local,perm, ptype;
const SiteHalfSpinor *pbuf = & buf[0];
const SiteSpinor *plocal = & in._odata[0];
void *pf;
int osites = in._grid->oSites();
StencilEntry *SE;
//#define STAMP(i) timers[i] = cyclecount() ;
#define STAMP(i) //timers[i] = cyclecount() ;
MASK_REGS;
first = cyclecount();
SE=st.GetEntry(ptype,Xm,ss);
// Xm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Ym,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXM(Xm,pf);
}
XP_RECON;
// Ym
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYM(Ym,pf);
}
YP_RECON_ACCUM;
// Zm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Tm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZM(Zm,pf);
}
ZP_RECON_ACCUM;
// Tm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
SE=st.GetEntry(ptype,Tp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTM(Tm,pf);
}
TP_RECON_ACCUM;
// Tp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTP(Tp,pf);
}
TM_RECON_ACCUM;
// Zp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Yp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZP(Zp,pf);
}
ZM_RECON_ACCUM;
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Xp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYP(Yp,pf);
}
YM_RECON_ACCUM;
// Xp
perm = SE->_permute;
offset = SE->_offset;
local = SE->_is_local;
// Prefetch
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXP(Xp,pf);
}
XM_RECON_ACCUM;
debug:
SAVE_RESULT(&out._odata[ss]);
assert(0);
}
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
template class WilsonKernels<GparityWilsonImplF>;
template class WilsonKernels<GparityWilsonImplD>;
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
}}
#if defined(AVX512)
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
static Vector<vComplexF> signs;
int setupSigns(void ){
Vector<vComplexF> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
return 1;
}
static int signInit = setupSigns();
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr)
template<>
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef VMOVIDUP
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#define MAYBEPERM(A,B)
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
template<>
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#endif
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
template class WilsonKernels<GparityWilsonImplF>;
template class WilsonKernels<GparityWilsonImplD>;
template class WilsonKernels<DomainWallRedBlack5dImplF>;
template class WilsonKernels<DomainWallRedBlack5dImplD>;
}}
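The tail of this file shows the instantiation trick: bind MAYBEPERM, MULT_2SPIN and the VMOV*DUP macros one way, #include WilsonKernelsAsmBody.h as the function body for WilsonImplF, then rebind them and include it again for the Ls-vectorised DomainWallRedBlack5dImplF. A toy version of the same macro-rebinding pattern (bodies inlined here rather than included from a header):

#include <cstdio>

// First binding: the "Wilson" flavour of a shared body.
#define COMBINE(a, b) ((a) + (b))
static int body_wilson(int a, int b) { return COMBINE(a, b); }
#undef COMBINE

// Rebind the same name and instantiate the body again, differently.
#define COMBINE(a, b) ((a) * (b))
static int body_dwf5d(int a, int b) { return COMBINE(a, b); }
#undef COMBINE

int main(void) {
  std::printf("%d %d\n", body_wilson(2, 3), body_dwf5d(2, 3)); // 5 6
  return 0;
}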

View File

@@ -0,0 +1,154 @@
{
int locala,perma, ptypea;
int localb,permb, ptypeb;
uint64_t basea, baseb;
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
vComplexF *isigns = &signs[0];
MASK_REGS;
////////////////////////////////
// Xp
////////////////////////////////
int ent=ss*8;// 2*Ndim
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns);
XM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXP(Xp,baseb);
}
LOAD64(%r10,isigns);
XM_RECON;
////////////////////////////////
// Yp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYP(Yp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YM_RECON_ACCUM;
////////////////////////////////
// Zp
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZP(Zp,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZM_RECON_ACCUM;
////////////////////////////////
// Tp
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTP(Tp,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TM_RECON_ACCUM;
////////////////////////////////
// Xm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR3,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFXM(Xm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
XP_RECON_ACCUM;
////////////////////////////////
// Ym
////////////////////////////////
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR2,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFYM(Ym,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
YP_RECON_ACCUM;
////////////////////////////////
// Zm
////////////////////////////////
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
if ( locala ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_PROJMEM(basea);
MAYBEPERM(PERMUTE_DIR1,perma);
} else {
LOAD_CHI(basea);
}
{
MULT_2SPIN_DIR_PFZM(Zm,baseb);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
ZP_RECON_ACCUM;
////////////////////////////////
// Tm
////////////////////////////////
basea = (uint64_t)&out._odata[ss];
if ( localb ) {
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_PROJMEM(baseb);
MAYBEPERM(PERMUTE_DIR0,permb);
} else {
LOAD_CHI(baseb);
}
{
MULT_2SPIN_DIR_PFTM(Tm,basea);
}
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
TP_RECON_ACCUM;
SAVE_RESULT(&out._odata[ss]);
}
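Note the basea/baseb ping-pong running through this body: while direction N is projected and multiplied, the stencil entry (address and prefetch target) for direction N+1 is already being resolved. A scalar sketch of the one-ahead pattern (lookup/compute are stand-ins for st.GetInfo and the project/multiply macros):

#include <cstdio>

struct Entry { int addr; };                                // stand-in for StencilEntry
static Entry lookup(int dir) { return Entry{1000 + dir}; } // cf. st.GetInfo
static void compute(const Entry &e) { std::printf("compute %d\n", e.addr); }

int main(void) {
  const int Ndirs = 8;                                     // Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm
  Entry curr = lookup(0);
  for (int d = 0; d < Ndirs; d++) {
    Entry next = lookup((d + 1) % Ndirs);                  // resolve next direction early
    compute(curr);                                         // project/multiply current one
    curr = next;
  }
  return 0;
}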

View File

@@ -312,7 +312,7 @@ namespace QCD {
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@@ -552,12 +552,10 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
return 0;
}
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@@ -798,7 +796,6 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
return 0;
}
@@ -806,125 +803,80 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
// Specialise Gparity to simple implementation
////////////////////////////////////////////////
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
//////////////
/*
template<>
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
return 0;
}
////////////// Wilson ; uses this implementation /////////////////////
// Need Nc=3 though //
template<>
int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
template<>
int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
return 0;
}
*/
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);
template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out);

View File

@@ -367,6 +367,9 @@ namespace Grid {
template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) { vsplat(ret,S(0.0,0.0)); }// use xor?
template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}
template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));}
template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));}
// if not complex overload here
template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }

View File

@@ -87,14 +87,39 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
/*
* TimesI is used only in the XP recon
* Could zero the regs and use RECON_ACCUM
*/
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
@@ -111,6 +136,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
#if 0
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
@@ -127,6 +154,35 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
#else
// o_p must point to floating 1.0f/d
//
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i - Ar ; ACC r + Ai
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2f(A,ACC,tmp)
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2d(A,ACC,tmp)
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i + Ar ; ACC r - Ai
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
#define VACCTIMESI2f(A,ACC,tmp)
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
#define VACCTIMESI2d(A,ACC,tmp)
#endif
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
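The #else branch above trades the masked add/sub pair (three instructions using %k6/%k7) for a shuffle plus one fused multiply-add against a per-lane (-1,+1) or (1,-1) constant pair held in memory at %r10 (the signs vector set up in WilsonKernelsAsm.cc). The scalar identity being exploited, with a quick check:

#include <complex>
#include <cstdio>

int main(void) {
  // ACC += i*A: since i*(ar + i*ai) = -ai + i*ar, shuffling A to (ai,ar) and
  // fused-multiply-adding against the per-lane constants (-1,+1) gives
  //   ACC.re += -ai ;  ACC.im += +ar
  std::complex<double> A(2.0, 3.0), ACC(10.0, 20.0);
  std::complex<double> expect = ACC + std::complex<double>(0, 1) * A;
  std::complex<double> trick(ACC.real() - A.imag(), ACC.imag() + A.real());
  std::printf("(%g,%g) vs (%g,%g)\n",
              expect.real(), expect.imag(), trick.real(), trick.imag()); // equal
  return 0;
}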

View File

@@ -1,92 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Avx512Asm.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_ADDSUB_H
#define GRID_ASM_AV512_ADDSUB_H
////////////////////////////////////////////////////////////////
// Building blocks for SU3 x 2spinor
// Load columns of U
// 18 U DUP's rr/ii
// 6 Chi shuffles ir,ri
// 6muls, 30 fmaddsubs
////////////////////////////////////////////////////////////////
#define MULT_ADDSUB_2SPIN(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
VMOVIDUPf(0,%r8,Z0 ) \
VMOVIDUPf(3,%r8,Z1 )\
VMOVIDUPf(6,%r8,Z2 )\
VSHUFf(Chi_00,T1) \
VSHUFf(Chi_10,T2) \
\
VMULf(Z0,T1,UChi_00) VMOVRDUPf(0,%r8,Z3 ) \
VMULf(Z0,T2,UChi_10) VMOVRDUPf(3,%r8,Z4 ) \
VMULf(Z1,T1,UChi_01) VMOVRDUPf(6,%r8,Z5 ) \
VMULf(Z1,T2,UChi_11) VMOVIDUPf(1,%r8,Z0 ) \
VMULf(Z2,T1,UChi_02) VMOVIDUPf(4,%r8,Z1 ) \
VMULf(Z2,T2,UChi_12) VMOVIDUPf(7,%r8,Z2 ) \
\
VMADDSUBf(Z3,Chi_00,UChi_00) VSHUFf(Chi_01,T1) \
VMADDSUBf(Z3,Chi_10,UChi_10) VSHUFf(Chi_11,T2) \
VMADDSUBf(Z4,Chi_00,UChi_01) VMOVRDUPf(1,%r8,Z3 ) \
VMADDSUBf(Z4,Chi_10,UChi_11)\
VMADDSUBf(Z5,Chi_00,UChi_02) VMOVRDUPf(4,%r8,Z4 ) \
VMADDSUBf(Z5,Chi_10,UChi_12)\
\
VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(7,%r8,Z5 ) \
VMADDSUBf(Z0,T2,UChi_10)\
VMADDSUBf(Z1,T1,UChi_01) VMOVIDUPf(2,%r8,Z0 ) \
VMADDSUBf(Z1,T2,UChi_11)\
VMADDSUBf(Z2,T1,UChi_02) VMOVIDUPf(5,%r8,Z1 ) \
VMADDSUBf(Z2,T2,UChi_12) VMOVIDUPf(8,%r8,Z2 ) \
\
VMADDSUBf(Z3,Chi_01,UChi_00) VSHUFf(Chi_02,T1) \
VMADDSUBf(Z3,Chi_11,UChi_10) VSHUFf(Chi_12,T2) \
VMADDSUBf(Z4,Chi_01,UChi_01) VMOVRDUPf(2,%r8,Z3 ) \
VMADDSUBf(Z4,Chi_11,UChi_11)\
VMADDSUBf(Z5,Chi_01,UChi_02) VMOVRDUPf(5,%r8,Z4 ) \
VMADDSUBf(Z5,Chi_11,UChi_12)\
\
VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(8,%r8,Z5 ) \
VMADDSUBf(Z0,T2,UChi_10)\
VMADDSUBf(Z1,T1,UChi_01)\
VMADDSUBf(Z1,T2,UChi_11)\
VMADDSUBf(Z2,T1,UChi_02)\
VMADDSUBf(Z2,T2,UChi_12)\
\
VMADDSUBf(Z3,Chi_02,UChi_00)\
VMADDSUBf(Z3,Chi_12,UChi_10)\
VMADDSUBf(Z4,Chi_02,UChi_01)\
VMADDSUBf(Z4,Chi_12,UChi_11)\
VMADDSUBf(Z5,Chi_02,UChi_02)\
VMADDSUBf(Z5,Chi_12,UChi_12)\
);
#endif
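The deleted building block above (its successors live in Intel512single.h/Intel512double.h) evaluates each complex multiply as one shuffle plus two multiply/add-sub steps against the imaginary- and real-part broadcasts of U, which is where the "18 U DUP's, 6 Chi shuffles, 6 muls, 30 fmaddsubs" budget comes from. For a single scalar element the decomposition is:

#include <complex>
#include <cstdio>

int main(void) {
  // One complex multiply u*chi, decomposed as the macro schedules it:
  //   tmp = im(u) * swap(chi)        (VMOVIDUP + VSHUF + VMUL)
  //   res = re(u)*chi addsub tmp     (VMOVRDUP + VMADDSUB: even lane subtracts,
  //                                   odd lane adds)
  std::complex<float> u(1.5f, -0.5f), chi(2.0f, 3.0f);
  float tmp_re = u.imag() * chi.imag();          // swap puts chi.im in the re lane
  float tmp_im = u.imag() * chi.real();
  float res_re = u.real() * chi.real() - tmp_re; // addsub: subtract (even lane)
  float res_im = u.real() * chi.imag() + tmp_im; // addsub: add (odd lane)
  std::complex<float> direct = u * chi;
  std::printf("(%g,%g) vs (%g,%g)\n",
              res_re, res_im, direct.real(), direct.imag()); // equal
  return 0;
}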

View File

@@ -86,8 +86,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
#define VPREFETCHG(O,A)
#define VPREFETCHW(O,A)
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#define VEVICT(O,A)
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"

View File

@@ -133,3 +133,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)

View File

@@ -116,7 +116,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
#define VSHUF(A,B) VSHUFf(A,B)
#undef ZEND1
#undef ZEND2
#undef ZLOAD
@@ -133,3 +132,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum) VMULIDUPf(O,P,B,accum)

View File

@@ -1,4 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -27,9 +27,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H
//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for Wilson Kernel are precision and IMCI/AVX512 independent
// Register allocations for Wilson Kernel are precision independent
//////////////////////////////////////////////////////////////////////////////////////////
#define result_00 %zmm0
#define result_01 %zmm1
@@ -64,7 +64,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define UChi_12 %zmm23
#define Uir %zmm24
//#define ONE %zmm24
#define Uri %zmm25
#define T1 %zmm24
#define T2 %zmm25
@@ -92,13 +91,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define Chimu_32 UChi_12
#include <simd/Intel512common.h>
#ifdef AVX512
#include <simd/Intel512avx.h>
//#include <simd/Intel512avxAddsub.h> // Alternate implementation
#endif
#ifdef IMCI
#include <simd/Intel512imci.h>
#endif
//////////////////////////////////////////////////////////////////
// Macros used to build wilson kernel -- can rationalise and simplify
@@ -193,47 +186,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSTORE(11,%r8,result_32) \
);
// auto ptr = &U._odata[sU](A);
// A plan for lifting loads:
// can use Z2/3/4/5 and U/U for the U field in the first step;
// can use Chi_00, Chi_10 and U/U for the U field in the second step;
// can use Chi_00, Chi_10, Chi_01, Chi_11 and U/U for the U field in the third step.
// This enables lifting ALL loads a few cycles earlier, alleviating OoO pressure if needed.
// KNL is DUAL issue for FP, so lifting these loads is potentially important.
// Need detailed profile data to be sure.
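
// Sketch of the lifting idea (illustrative; names hypothetical, and
// cmadd_rrii is the helper sketched earlier): issue all nine U loads
// before the FMA chain so they retire while earlier arithmetic executes.
#include <immintrin.h>
__m512 cmadd_rrii(__m512 u, __m512 chi, __m512 accum); // earlier sketch

inline void mult_su3_hoisted(const float *Uptr, const __m512 chi[3],
                             __m512 uchi[3])
{
  __m512 u[9];
  for (int c = 0; c < 9; ++c)
    u[c] = _mm512_loadu_ps(Uptr + 16 * c);   // lift ALL loads up front
  for (int r = 0; r < 3; ++r) {
    uchi[r] = _mm512_setzero_ps();
    for (int c = 0; c < 3; ++c)              // then pure FMA traffic
      uchi[r] = cmadd_rrii(u[3 * r + c], chi[c], uchi[r]);
  }
}
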
#if 0
#define PREFETCH_U(A) \
LOAD64(%r8,&U._odata[sU](A)) \
__asm__ ( \
VPREFETCHG(0,%r8) \
VPREFETCHG(1,%r8) \
VPREFETCHG(2,%r8) \
VPREFETCHG(3,%r8) \
VPREFETCHG(4,%r8) \
VPREFETCHG(5,%r8) \
VPREFETCHG(6,%r8) \
VPREFETCHG(7,%r8) \
VPREFETCHG(8,%r8) );
#define PREFETCH_R(A) \
LOAD64(%r8,&out._odata[ss]) \
__asm__ ( \
VPREFETCHW(0,%r8) \
VPREFETCHW(1,%r8) \
VPREFETCHW(2,%r8) \
VPREFETCHW(3,%r8) \
VPREFETCHW(4,%r8) \
VPREFETCHW(5,%r8) \
VPREFETCHW(6,%r8) \
VPREFETCHW(7,%r8) \
VPREFETCHW(8,%r8) \
VPREFETCHW(9,%r8) \
VPREFETCHW(10,%r8) \
VPREFETCHW(11,%r8) );
#endif
#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
@@ -244,131 +196,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
#if 0
#define MULT_2SPIN_UNOPT(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
ZLOAD (0,%r8,UChi_01,UChi_11) \
ZLOAD (3,%r8,UChi_02,UChi_12) \
ZLOAD (6,%r8,Uri,Uir) \
ZMUL (UChi_01,UChi_11,Chi_00,UChi_00,Z0) \
ZMUL (UChi_01,UChi_11,Chi_10,UChi_10,Z1) \
ZMUL (UChi_02,UChi_12,Chi_00,UChi_01,Z2) \
ZMUL (UChi_02,UChi_12,Chi_10,UChi_11,Z3) \
ZMUL (Uri,Uir, Chi_00,UChi_02,Z4) \
ZMUL (Uri,Uir, Chi_10,UChi_12,Z5) \
\
ZLOAD (1,%r8,Uri,Uir) \
ZLOAD (4,%r8,Chi_00, Chi_10) \
ZMADD (Uri,Uir, Chi_01,UChi_00,Z0) \
ZMADD (Uri,Uir, Chi_11,UChi_10,Z1) \
ZLOAD (7,%r8,Uri,Uir) \
ZMADD (Chi_00, Chi_10,Chi_01,UChi_01,Z2) \
ZMADD (Chi_00, Chi_10,Chi_11,UChi_11,Z3) \
ZLOAD (2,%r8,Chi_00,Chi_10) \
ZMADD(Uri,Uir, Chi_01,UChi_02,Z4) \
ZMADD(Uri,Uir, Chi_11,UChi_12,Z5) \
\
ZLOAD (5,%r8,Uri,Uir) \
ZMADD (Chi_00,Chi_10, Chi_02,UChi_00,Z0) \
ZMADD (Chi_00,Chi_10, Chi_12,UChi_10,Z1) \
ZLOAD (8,%r8,Chi_00,Chi_10) \
ZMADD (Uri,Uir, Chi_02,UChi_01,Z2) \
ZMADD (Uri,Uir, Chi_12,UChi_11,Z3) \
ZMADD(Chi_00,Chi_10, Chi_02,UChi_02,Z4) \
ZMADD(Chi_00,Chi_10, Chi_12,UChi_12,Z5) \
\
ZEND1(UChi_00,Z0,Chi_01) \
ZEND1(UChi_10,Z1,Chi_11) \
ZEND1(UChi_01,Z2,Chi_00) \
ZEND1(UChi_11,Z3,Chi_10) \
ZEND1(UChi_02,Z4,Chi_02) \
ZEND1(UChi_12,Z5,Chi_12) \
ZEND2(UChi_00,Z0,Chi_01) \
ZEND2(UChi_10,Z1,Chi_11) \
ZEND2(UChi_01,Z2,Chi_00) \
ZEND2(UChi_11,Z3,Chi_10) \
ZEND2(UChi_02,Z4,Chi_02) \
ZEND2(UChi_12,Z5,Chi_12) );
#endif
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
// MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
#if 0
#define MULT_2SPIN_PF(ptr,pf,VPF) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
ZMULMEM2SP(0,%r8,Uri,Chi_00,Chi_10,UChi_00,Z0,UChi_10,Z1) \
VPF(0,%r9) \
ZMULMEM2SP(3,%r8,Uri,Chi_00,Chi_10,UChi_01,Z2,UChi_11,Z3) \
VPF(1,%r9) \
ZMULMEM2SP(6,%r8,Uri,Chi_00,Chi_10,UChi_02,Z4,UChi_12,Z5) \
VPF(2,%r9) \
\
ZMADDMEM2SP(1,%r8,Uri,Chi_01,Chi_11,UChi_00,Z0,UChi_10,Z1) \
VPF(3,%r9) \
ZMADDMEM2SP(4,%r8,Uri,Chi_01,Chi_11,UChi_01,Z2,UChi_11,Z3) \
VPF(4,%r9) \
ZMADDMEM2SP(7,%r8,Uri,Chi_01,Chi_11,UChi_02,Z4,UChi_12,Z5) \
VPF(5,%r9) \
\
ZMADDMEM2SP(2,%r8,Uri,Chi_02,Chi_12,UChi_00,Z0,UChi_10,Z1) \
VPF(6,%r9) \
ZMADDMEM2SP(5,%r8,Uri,Chi_02,Chi_12,UChi_01,Z2,UChi_11,Z3) \
VPF(7,%r9) \
ZMADDMEM2SP(8,%r8,Uri,Chi_02,Chi_12,UChi_02,Z4,UChi_12,Z5) \
VPF(8,%r9) \
\
ZEND1(UChi_00,Z0,Chi_01) \
ZEND1(UChi_10,Z1,Chi_11) \
ZEND1(UChi_01,Z2,Chi_00) \
ZEND1(UChi_11,Z3,Chi_10) \
VPF(9,%r9) \
ZEND1(UChi_02,Z4,Chi_02) \
ZEND1(UChi_12,Z5,Chi_12) \
ZEND2(UChi_00,Z0,Chi_01) \
ZEND2(UChi_10,Z1,Chi_11) \
VPF(10,%r9) \
ZEND2(UChi_01,Z2,Chi_00) \
ZEND2(UChi_11,Z3,Chi_10) \
ZEND2(UChi_02,Z4,Chi_02) \
VPF(11,%r9) \
ZEND2(UChi_12,Z5,Chi_12) );
#endif
#if 0
#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VPF(0,%r9) \
VPF(1,%r9) \
VPF(2,%r9) \
\
VPF(3,%r9) \
VPF(4,%r9) \
VPF(5,%r9) \
\
VPF(6,%r9) \
VPF(7,%r9) \
VPF(8,%r9) \
\
VPF(9,%r9) \
VPF(10,%r9) \
VPF(11,%r9) );
#endif
// Pretty much Perfectly Pipelined
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)
//////////////////////////////////////////////////////////////////
// Dirac algebra
@@ -490,7 +325,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
LOAD64(%r8,ptr) \
__asm__ ( \
LOAD_CHIMU01i \
VSUBMEM(6,%r8 ,Chimu_00,Chi_00) \
VSUBMEM(6,%r8,Chimu_00,Chi_00) \
VSUBMEM(7,%r8,Chimu_01,Chi_01) \
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
@@ -503,18 +338,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
VZERO(TMP) \
VMOV(UChi_00,result_00) \
VMOV(UChi_01,result_01) \
VMOV(UChi_02,result_02) \
VMOV(UChi_10,result_10) \
VMOV(UChi_11,result_11) \
VMOV(UChi_12,result_12) \
VTIMESMINUSI0(UChi_10,result_20,TMP) \
VTIMESMINUSI0(UChi_11,result_21,TMP) \
VTIMESMINUSI0(UChi_12,result_22,TMP) \
VTIMESMINUSI0(UChi_00,result_30,TMP) \
VTIMESMINUSI0(UChi_10,result_20,TMP) \
VTIMESMINUSI0(UChi_01,result_31,TMP) \
VTIMESMINUSI0(UChi_11,result_21,TMP) \
VTIMESMINUSI0(UChi_02,result_32,TMP) \
VTIMESMINUSI0(UChi_12,result_22,TMP) \
VMOV(UChi_00,result_00) \
VMOV(UChi_10,result_10) \
VMOV(UChi_01,result_01) \
VMOV(UChi_11,result_11) \
VMOV(UChi_02,result_02) \
VMOV(UChi_12,result_12) \
VTIMESMINUSI1(UChi_10,result_20,TMP) \
VTIMESMINUSI1(UChi_11,result_21,TMP) \
VTIMESMINUSI1(UChi_12,result_22,TMP) \
@@ -531,24 +366,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
VZERO(TMP)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
@@ -559,24 +394,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define XM_RECON __asm__ ( \
VZERO(TMP)\
VMOV(UChi_00,result_00)\
VMOV(UChi_01,result_01)\
VMOV(UChi_02,result_02)\
VMOV(UChi_10,result_10)\
VMOV(UChi_11,result_11)\
VMOV(UChi_12,result_12)\
VTIMESI0(UChi_10,result_20,TMP)\
VTIMESI0(UChi_11,result_21,TMP)\
VTIMESI0(UChi_12,result_22,TMP)\
VTIMESI0(UChi_00,result_30,TMP)\
VTIMESI0(UChi_10,result_20,TMP)\
VTIMESI0(UChi_01,result_31,TMP)\
VTIMESI0(UChi_11,result_21,TMP)\
VTIMESI0(UChi_02,result_32,TMP)\
VTIMESI1(UChi_10,result_20,TMP)\
VTIMESI1(UChi_11,result_21,TMP)\
VTIMESI1(UChi_12,result_22,TMP)\
VTIMESI0(UChi_12,result_22,TMP)\
VMOV(UChi_00,result_00)\
VMOV(UChi_10,result_10)\
VMOV(UChi_01,result_01)\
VMOV(UChi_11,result_11)\
VMOV(UChi_02,result_02)\
VMOV(UChi_12,result_12)\
VTIMESI1(UChi_00,result_30,TMP)\
VTIMESI1(UChi_10,result_20,TMP)\
VTIMESI1(UChi_01,result_31,TMP)\
VTIMESI1(UChi_11,result_21,TMP)\
VTIMESI1(UChi_02,result_32,TMP)\
VTIMESI1(UChi_12,result_22,TMP)\
VTIMESI2(UChi_10,result_20,TMP)\
VTIMESI2(UChi_11,result_21,TMP)\
VTIMESI2(UChi_12,result_22,TMP)\
@@ -586,23 +421,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
);
#define XM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESI0(UChi_10,result_20,Z0)\
VACCTIMESI0(UChi_11,result_21,Z1)\
VACCTIMESI0(UChi_12,result_22,Z2)\
VACCTIMESI0(UChi_00,result_30,Z3)\
VACCTIMESI0(UChi_11,result_21,Z1)\
VACCTIMESI0(UChi_01,result_31,Z4)\
VACCTIMESI0(UChi_12,result_22,Z2)\
VACCTIMESI0(UChi_02,result_32,Z5)\
\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_02,result_02,result_02)\
\
VACCTIMESI1(UChi_10,result_20,Z0)\
VACCTIMESI1(UChi_11,result_21,Z1)\
VACCTIMESI1(UChi_12,result_22,Z2)\
VACCTIMESI1(UChi_00,result_30,Z3)\
VACCTIMESI1(UChi_11,result_21,Z1)\
VACCTIMESI1(UChi_01,result_31,Z4)\
VACCTIMESI1(UChi_12,result_22,Z2)\
VACCTIMESI1(UChi_02,result_32,Z5)\
VACCTIMESI2(UChi_10,result_20,Z0)\
VACCTIMESI2(UChi_11,result_21,Z1)\
@@ -614,10 +451,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define YP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_10,result_20,result_20)\
VADD(UChi_11,result_21,result_21)\
@@ -628,10 +465,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define YM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VSUB(UChi_10,result_20,result_20)\
VSUB(UChi_11,result_21,result_21)\
@@ -641,23 +478,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VADD(UChi_02,result_32,result_32) );
#define ZP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
VACCTIMESI0(UChi_10,result_30,Z3)\
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
VACCTIMESI0(UChi_11,result_31,Z4)\
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
VACCTIMESI0(UChi_12,result_32,Z5)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
VACCTIMESI1(UChi_10,result_30,Z3)\
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
VACCTIMESI1(UChi_11,result_31,Z4)\
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
VACCTIMESI1(UChi_12,result_32,Z5)\
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
@@ -668,23 +505,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
);
#define ZM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESI0(UChi_00,result_20,Z0)\
VACCTIMESI0(UChi_01,result_21,Z1)\
VACCTIMESI0(UChi_02,result_22,Z2)\
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
VACCTIMESI0(UChi_01,result_21,Z1)\
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
VACCTIMESI0(UChi_02,result_22,Z2)\
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
VADD(UChi_00,result_00,result_00)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VACCTIMESI1(UChi_00,result_20,Z0)\
VACCTIMESI1(UChi_01,result_21,Z1)\
VACCTIMESI1(UChi_02,result_22,Z2)\
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
VACCTIMESI1(UChi_01,result_21,Z1)\
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
VACCTIMESI1(UChi_02,result_22,Z2)\
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
VACCTIMESI2(UChi_00,result_20,Z0)\
VACCTIMESI2(UChi_01,result_21,Z1)\
@@ -696,30 +533,30 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define TP_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VADD(UChi_00,result_20,result_20)\
VADD(UChi_01,result_21,result_21)\
VADD(UChi_02,result_22,result_22)\
VADD(UChi_10,result_30,result_30)\
VADD(UChi_01,result_21,result_21)\
VADD(UChi_11,result_31,result_31)\
VADD(UChi_02,result_22,result_22)\
VADD(UChi_12,result_32,result_32) );
#define TM_RECON_ACCUM __asm__ ( \
VADD(UChi_00,result_00,result_00)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_10,result_10,result_10)\
VADD(UChi_01,result_01,result_01)\
VADD(UChi_11,result_11,result_11)\
VADD(UChi_02,result_02,result_02)\
VADD(UChi_12,result_12,result_12)\
VSUB(UChi_00,result_20,result_20)\
VSUB(UChi_01,result_21,result_21)\
VSUB(UChi_02,result_22,result_22)\
VSUB(UChi_10,result_30,result_30)\
VSUB(UChi_01,result_21,result_21)\
VSUB(UChi_11,result_31,result_31)\
VSUB(UChi_02,result_22,result_22)\
VSUB(UChi_12,result_32,result_32) );
//#define PREFETCH_CHIMU(A)
@@ -758,63 +595,200 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define MULT_ADDSUB_2SPIN1(ptr) \
LOAD64(%r8,ptr)
/*
 * __asm__ ( \
 *   VMUL(Z0,%zmm2,%zmm3) \
 * );
 */
#define MULT_ADDSUB_2SPIN(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 )\
VMOVIDUP(6,%r8,Z2 )\
VSHUF(Chi_00,T1) \
VSHUF(Chi_10,T2) \
\
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
\
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
VMADDSUB(Z3,Chi_10,UChi_10) VSHUF(Chi_11,T2) \
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
VMADDSUB(Z4,Chi_10,UChi_11)\
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
VMADDSUB(Z5,Chi_10,UChi_12)\
\
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10)\
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
VMADDSUB(Z1,T2,UChi_11)\
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
\
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
VMADDSUB(Z3,Chi_11,UChi_10) VSHUF(Chi_12,T2) \
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
VMADDSUB(Z4,Chi_11,UChi_11)\
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
VMADDSUB(Z5,Chi_11,UChi_12)\
\
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10)\
VMADDSUB(Z1,T1,UChi_01)\
VMADDSUB(Z1,T2,UChi_11)\
VMADDSUB(Z2,T1,UChi_02)\
VMADDSUB(Z2,T2,UChi_12)\
\
VMADDSUB(Z3,Chi_02,UChi_00)\
VMADDSUB(Z3,Chi_12,UChi_10)\
VMADDSUB(Z4,Chi_02,UChi_01)\
VMADDSUB(Z4,Chi_12,UChi_11)\
VMADDSUB(Z5,Chi_02,UChi_02)\
VMADDSUB(Z5,Chi_12,UChi_12)\
);
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
#define MULT_ADDSUB_2SPIN(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
VSHUF(Chi_00,T1) \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 ) \
VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
/*6*/ \
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
/*18*/ \
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
VMADDSUB(Z3,Chi_10,UChi_10) \
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
VMADDSUB(Z5,Chi_10,UChi_12) \
/*28*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
/*38*/ \
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
VMADDSUB(Z3,Chi_11,UChi_10) \
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
VMADDSUB(Z5,Chi_11,UChi_12) \
/*48*/ \
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z0,T2,UChi_10) \
VMADDSUB(Z1,T1,UChi_01) \
VMADDSUB(Z1,T2,UChi_11) \
VMADDSUB(Z2,T1,UChi_02) \
VMADDSUB(Z2,T2,UChi_12) \
/*55*/ \
VMADDSUB(Z3,Chi_02,UChi_00) \
VMADDSUB(Z3,Chi_12,UChi_10) \
VMADDSUB(Z4,Chi_02,UChi_01) \
VMADDSUB(Z4,Chi_12,UChi_11) \
VMADDSUB(Z5,Chi_02,UChi_02) \
VMADDSUB(Z5,Chi_12,UChi_12) \
/*61 insns*/ );
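
// Note on the running /*N*/ tallies above: read as issued-instruction
// counts, the 61-instruction variant on KNL's dual-issue FP pipe has an
// idealised lower bound of ceil(61/2) = 31 cycles per 2-spin multiply,
// assuming perfect pairing and no memory stalls -- which is why the
// VMOV*DUP loads are interleaved with the VMADDSUBs instead of batched.
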
#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
VPREFETCHG(0,%r9) \
VPREFETCHG(1,%r9) \
VPREFETCHG(2,%r9) \
VPREFETCHG(3,%r9) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
VPREFETCHG(4,%r9) \
VPREFETCHG(5,%r9) \
VPREFETCHG(6,%r9) \
VPREFETCHG(7,%r9) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
VPREFETCHG(8,%r9) \
VPREFETCHG(9,%r9) \
VPREFETCHG(10,%r9) \
VPREFETCHG(11,%r9) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
VPREFETCH2(12,%r9) \
VPREFETCH2(13,%r9) \
VPREFETCH2(14,%r9) \
VPREFETCH2(15,%r9) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VPREFETCH2(16,%r9) \
VPREFETCH2(17,%r9) \
VPREFETCH2(18,%r9) \
VPREFETCH2(19,%r9) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
VPREFETCH2(20,%r9) \
VPREFETCH2(21,%r9) \
VPREFETCH2(22,%r9) \
VPREFETCH2(23,%r9) \
VPREFETCHG(2,%r8) \
VPREFETCHG(3,%r8) \
VPREFETCH2(4,%r8) \
VPREFETCH2(5,%r8) \
/*42 insns*/ );
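
// Sketch (illustrative; names hypothetical, mult_su3_hoisted from the
// earlier sketch) of the _LS variant's idea in C++: touch the next
// site's spinor (12 cache lines) while this site's arithmetic runs.
// The asm above goes further, interleaving VPREFETCHG/VPREFETCH2
// between the FMA groups instead of batching them.
#include <immintrin.h>
#include <xmmintrin.h>
void mult_su3_hoisted(const float *Uptr, const __m512 chi[3], __m512 uchi[3]);

inline void mult_with_prefetch(const float *Uptr, const float *next_chi,
                               const __m512 chi[3], __m512 uchi[3])
{
  for (int line = 0; line < 12; ++line)  // full spinor: 12 lines per block
    _mm_prefetch((const char *)next_chi + 64 * line, _MM_HINT_T0);
  mult_su3_hoisted(Uptr, chi, uchi);     // compute overlaps the prefetches
}
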
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
LOAD64(%r8,ptr) \
LOAD64(%r9,pf) \
__asm__ ( \
VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
/*8*/ \
VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
/*16*/ \
VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
/*22*/ \
VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
/*30*/ \
VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
/*36*/ \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
/* VPREFETCHG(2,%r8)*/ \
/* VPREFETCHG(3,%r8)*/ \
/*42 insns*/ );
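
// Sketch (illustrative, not Grid's API): A/B the _LS and _LSNOPF
// variants by timing many calls of each, packaged as callables.
#include <chrono>

template <class Kernel>
double time_usec(Kernel k, int ncall)
{
  auto t0 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < ncall; ++i) k();   // repeat to amortise timer noise
  auto t1 = std::chrono::high_resolution_clock::now();
  return std::chrono::duration<double, std::micro>(t1 - t0).count();
}
// usage: double us = time_usec([]{ /* invoke one variant */ }, 100);
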
#define Z6 Chi_00
#define MULT_ADDSUB_2SPIN_NEW(ptr) \
LOAD64(%r8,ptr) \
__asm__ ( \
VSHUFMEM(0,%r8,Z0) \
VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
VSHUFMEM(3,%r8,Z0) \
VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
VSHUFMEM(6,%r8,Z0) \
VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
/*11 cycles*/ \
VSHUFMEM(1,%r8,Z0) \
VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
VSHUFMEM(4,%r8,Z0) \
VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
VSHUFMEM(7,%r8,Z0) \
VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
/*22 cycles*/ \
VSHUFMEM(2,%r8,Z0) \
VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
VSHUFMEM(5,%r8,Z0) \
VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
VSHUFMEM(8,%r8,Z0) \
VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
/*33 cycles*/ \
VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
/*stall*/ \
/*stall*/ \
/*stall*/ \
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )
#endif
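
// Scalar model (illustrative, not original source) of the split
// accumulation in MULT_ADDSUB_2SPIN_NEW above: direct products
// (u * re(c)) and cross products (swapped-u * im(c)) accumulate in
// separate sums -- the UChi and Z1..Z6 registers -- and one
// addsub-style combine fuses them at the end (the VADD tail).
#include <complex>
#include <cstdio>

int main()
{
  std::complex<float> u[3] = {{1, 2}, {3, 4}, {5, 6}};
  std::complex<float> c[3] = {{1, -1}, {2, 0}, {0, 3}};
  float dir_re = 0, dir_im = 0, cro_re = 0, cro_im = 0;
  for (int k = 0; k < 3; ++k) {
    dir_re += u[k].real() * c[k].real(); // even lanes, VMULMEM/VMADDMEM
    dir_im += u[k].imag() * c[k].real(); // odd  lanes, VMULMEM/VMADDMEM
    cro_re += u[k].imag() * c[k].imag(); // even lanes, VMUL on VSHUFMEM
    cro_im += u[k].real() * c[k].imag(); // odd  lanes, VMUL on VSHUFMEM
  }
  std::complex<float> r(dir_re - cro_re, dir_im + cro_im); // addsub combine
  std::complex<float> chk = u[0] * c[0] + u[1] * c[1] + u[2] * c[2];
  std::printf("%g+%gi vs %g+%gi\n", r.real(), r.imag(), chk.real(), chk.imag());
  return 0;
}
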

View File

@@ -103,7 +103,9 @@ void LebesgueOrder::IterateI(int ND,
} else {
for(int d=0;d<ND;d++){
x[d]=xi[d]+xo[d];
std::cout << x[d]<<" ";
}
std::cout << "\n";
IndexInteger index;
Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
_LebesgueReorder.push_back(index);
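
// Sketch of what Lexicographic::IndexFromCoor computes (an assumption
// from context: Grid's usual dimension-0-fastest convention):
#include <vector>
inline void index_from_coor(const std::vector<int> &coor, int &index,
                            const std::vector<int> &dims)
{
  index = 0;
  for (int d = (int)dims.size() - 1; d >= 0; --d)
    index = index * dims[d] + coor[d];   // coor[0] varies fastest
}
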
@@ -188,6 +190,7 @@ void LebesgueOrder::ZGraph(void)
}
assert( _LebesgueReorder.size() == vol );
/*
std::vector<int> coor(4);
for(IndexInteger asite=0;asite<vol;asite++){
grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
@@ -198,5 +201,6 @@ void LebesgueOrder::ZGraph(void)
<< coor[3]<<"]"
<<std::endl;
}
*/
}
}
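
// Sketch: the core 2D Morton (Z-curve) bit interleave underlying the
// Lebesgue/Z-graph ordering idea; Grid's ZGraph handles 4D and
// non-power-of-two local volumes on top of this.
#include <cstdint>

inline uint32_t part1by1(uint32_t x)  // spread bits: b2 b1 b0 -> b2 0 b1 0 b0
{
  x &= 0x0000ffff;
  x = (x | (x << 8)) & 0x00ff00ff;
  x = (x | (x << 4)) & 0x0f0f0f0f;
  x = (x | (x << 2)) & 0x33333333;
  x = (x | (x << 1)) & 0x55555555;
  return x;
}
inline uint32_t morton2d(uint32_t x, uint32_t y)
{ return part1by1(x) | (part1by1(y) << 1); }
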