mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 15:55:37 +00:00
Merge branch 'master' into hadrons
This commit is contained in:
commit
3834d81181
@ -58,7 +58,7 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||
|
||||
std::vector<int> latt4 = GridDefaultLatt();
|
||||
const int Ls=16;
|
||||
const int Ls=8;
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
@ -143,6 +143,7 @@ int main (int argc, char ** argv)
|
||||
Dw.Report();
|
||||
}
|
||||
|
||||
exit(0);
|
||||
|
||||
if (1)
|
||||
{ // Naive wilson dag implementation
|
||||
@ -197,7 +198,6 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
|
||||
}
|
||||
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
Dw.DhopOE(src_e,r_o,DaggerNo);
|
||||
Dw.Dhop (src ,result,DaggerNo);
|
||||
|
171
benchmarks/Benchmark_zmm.cc
Normal file
171
benchmarks/Benchmark_zmm.cc
Normal file
@ -0,0 +1,171 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/Test_zmm.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
#include <PerfCount.h>
|
||||
|
||||
|
||||
using namespace Grid;
|
||||
using namespace Grid::QCD;
|
||||
|
||||
|
||||
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
|
||||
|
||||
int main(int argc,char **argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
std::ofstream os("zmm.dat");
|
||||
|
||||
os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
|
||||
for(int L=4;L<=32;L+=4){
|
||||
for(int m=1;m<=2;m++){
|
||||
for(int Ls=8;Ls<=16;Ls+=8){
|
||||
std::vector<int> grid({L,L,m*L,m*L});
|
||||
for(int i=0;i<4;i++) {
|
||||
std::cout << grid[i]<<"x";
|
||||
}
|
||||
std::cout << Ls<<std::endl;
|
||||
bench(os,grid,Ls);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
|
||||
{
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
|
||||
|
||||
LatticeFermion src (FGrid);
|
||||
LatticeFermion tmp (FGrid);
|
||||
LatticeFermion srce(FrbGrid);
|
||||
|
||||
LatticeFermion resulto(FrbGrid); resulto=zero;
|
||||
LatticeFermion resulta(FrbGrid); resulta=zero;
|
||||
LatticeFermion junk(FrbGrid); junk=zero;
|
||||
LatticeFermion diff(FrbGrid);
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
|
||||
double mfc, mfa, mfo, mfl1;
|
||||
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||
random(RNG5,src);
|
||||
#if 1
|
||||
random(RNG4,Umu);
|
||||
#else
|
||||
int mmu=2;
|
||||
std::vector<LatticeColourMatrix> U(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
if ( mu!=mmu ) U[mu] = zero;
|
||||
if ( mu==mmu ) U[mu] = 1.0;
|
||||
PokeIndex<LorentzIndex>(Umu,U[mu],mu);
|
||||
}
|
||||
#endif
|
||||
pickCheckerboard(Even,srce,src);
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =1.8;
|
||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||
|
||||
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
|
||||
int ncall=50;
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.DhopOE(srce,resulto,0);
|
||||
}
|
||||
double t1=usecond();
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=1344*volume/2;
|
||||
|
||||
mfc = flops*ncall/(t1-t0);
|
||||
std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s = "<< mfc<<std::endl;
|
||||
|
||||
QCD::WilsonFermion5DStatic::AsmOptDslash=1;
|
||||
t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.DhopOE(srce,resulta,0);
|
||||
}
|
||||
t1=usecond();
|
||||
mfa = flops*ncall/(t1-t0);
|
||||
std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s = "<< mfa<<std::endl;
|
||||
|
||||
int dag=DaggerNo;
|
||||
t0=usecond();
|
||||
for(int i=0;i<1;i++){
|
||||
Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
|
||||
}
|
||||
t1=usecond();
|
||||
mfo = flops*100/(t1-t0);
|
||||
std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s = "<< mfo<<std::endl;
|
||||
|
||||
t0=usecond();
|
||||
for(int i=0;i<1;i++){
|
||||
Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
|
||||
}
|
||||
t1=usecond();
|
||||
mfl1= flops*100/(t1-t0);
|
||||
std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s = "<< mfl1<<std::endl;
|
||||
|
||||
os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
|
||||
<< mfc<<" "
|
||||
<< mfa<<" "
|
||||
<< mfo<<" "
|
||||
<< mfl1<<std::endl;
|
||||
|
||||
#if 0
|
||||
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
||||
Dw.DhopOE(srce,resulta,0);
|
||||
PerformanceCounter Counter(i);
|
||||
Counter.Start();
|
||||
Dw.DhopOE(srce,resulta,0);
|
||||
Counter.Stop();
|
||||
Counter.Report();
|
||||
}
|
||||
#endif
|
||||
//resulta = (-0.5) * resulta;
|
||||
|
||||
diff = resulto-resulta;
|
||||
std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
|
||||
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
|
||||
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
|
||||
|
||||
|
||||
Benchmark_comms_SOURCES=Benchmark_comms.cc
|
||||
@ -25,3 +25,7 @@ Benchmark_su3_LDADD=-lGrid
|
||||
Benchmark_wilson_SOURCES=Benchmark_wilson.cc
|
||||
Benchmark_wilson_LDADD=-lGrid
|
||||
|
||||
|
||||
Benchmark_zmm_SOURCES=Benchmark_zmm.cc
|
||||
Benchmark_zmm_LDADD=-lGrid
|
||||
|
||||
|
183
lib/Config.h.in
183
lib/Config.h.in
@ -1,183 +0,0 @@
|
||||
/* lib/Config.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
/* AVX Intrinsics */
|
||||
#undef AVX1
|
||||
|
||||
/* AVX2 Intrinsics */
|
||||
#undef AVX2
|
||||
|
||||
/* AVX512 Intrinsics for Knights Landing */
|
||||
#undef AVX512
|
||||
|
||||
/* AVX Intrinsics with FMA4 */
|
||||
#undef AVXFMA4
|
||||
|
||||
/* EMPTY_SIMD only for DEBUGGING */
|
||||
#undef EMPTY_SIMD
|
||||
|
||||
/* GRID_COMMS_MPI */
|
||||
#undef GRID_COMMS_MPI
|
||||
|
||||
/* GRID_COMMS_NONE */
|
||||
#undef GRID_COMMS_NONE
|
||||
|
||||
/* GRID_COMMS_SHMEM */
|
||||
#undef GRID_COMMS_SHMEM
|
||||
|
||||
/* GRID_DEFAULT_PRECISION is DOUBLE */
|
||||
#undef GRID_DEFAULT_PRECISION_DOUBLE
|
||||
|
||||
/* GRID_DEFAULT_PRECISION is SINGLE */
|
||||
#undef GRID_DEFAULT_PRECISION_SINGLE
|
||||
|
||||
/* Support Altivec instructions */
|
||||
#undef HAVE_ALTIVEC
|
||||
|
||||
/* Support AVX (Advanced Vector Extensions) instructions */
|
||||
#undef HAVE_AVX
|
||||
|
||||
/* Support AVX2 (Advanced Vector Extensions 2) instructions */
|
||||
#undef HAVE_AVX2
|
||||
|
||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
|
||||
don't. */
|
||||
#undef HAVE_DECL_BE64TOH
|
||||
|
||||
/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
|
||||
*/
|
||||
#undef HAVE_DECL_NTOHLL
|
||||
|
||||
/* Define to 1 if you have the <endian.h> header file. */
|
||||
#undef HAVE_ENDIAN_H
|
||||
|
||||
/* Define to 1 if you have the <execinfo.h> header file. */
|
||||
#undef HAVE_EXECINFO_H
|
||||
|
||||
/* Support FMA3 (Fused Multiply-Add) instructions */
|
||||
#undef HAVE_FMA
|
||||
|
||||
/* Define to 1 if you have the `gettimeofday' function. */
|
||||
#undef HAVE_GETTIMEOFDAY
|
||||
|
||||
/* Define to 1 if you have the <gmp.h> header file. */
|
||||
#undef HAVE_GMP_H
|
||||
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#undef HAVE_INTTYPES_H
|
||||
|
||||
/* Define to 1 if you have the <malloc.h> header file. */
|
||||
#undef HAVE_MALLOC_H
|
||||
|
||||
/* Define to 1 if you have the <malloc/malloc.h> header file. */
|
||||
#undef HAVE_MALLOC_MALLOC_H
|
||||
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#undef HAVE_MEMORY_H
|
||||
|
||||
/* Support mmx instructions */
|
||||
#undef HAVE_MMX
|
||||
|
||||
/* Define to 1 if you have the <mm_malloc.h> header file. */
|
||||
#undef HAVE_MM_MALLOC_H
|
||||
|
||||
/* Support SSE (Streaming SIMD Extensions) instructions */
|
||||
#undef HAVE_SSE
|
||||
|
||||
/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
|
||||
#undef HAVE_SSE2
|
||||
|
||||
/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
|
||||
#undef HAVE_SSE3
|
||||
|
||||
/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
|
||||
#undef HAVE_SSE4_1
|
||||
|
||||
/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
|
||||
#undef HAVE_SSE4_2
|
||||
|
||||
/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
|
||||
#undef HAVE_SSSE3
|
||||
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
#undef HAVE_STDINT_H
|
||||
|
||||
/* Define to 1 if you have the <stdlib.h> header file. */
|
||||
#undef HAVE_STDLIB_H
|
||||
|
||||
/* Define to 1 if you have the <strings.h> header file. */
|
||||
#undef HAVE_STRINGS_H
|
||||
|
||||
/* Define to 1 if you have the <string.h> header file. */
|
||||
#undef HAVE_STRING_H
|
||||
|
||||
/* Define to 1 if you have the <sys/stat.h> header file. */
|
||||
#undef HAVE_SYS_STAT_H
|
||||
|
||||
/* Define to 1 if you have the <sys/types.h> header file. */
|
||||
#undef HAVE_SYS_TYPES_H
|
||||
|
||||
/* Define to 1 if you have the <unistd.h> header file. */
|
||||
#undef HAVE_UNISTD_H
|
||||
|
||||
/* IMCI Intrinsics for Knights Corner */
|
||||
#undef IMCI
|
||||
|
||||
/* NEON ARMv8 Experimental support */
|
||||
#undef NEONv8
|
||||
|
||||
/* Name of package */
|
||||
#undef PACKAGE
|
||||
|
||||
/* Define to the address where bug reports for this package should be sent. */
|
||||
#undef PACKAGE_BUGREPORT
|
||||
|
||||
/* Define to the full name of this package. */
|
||||
#undef PACKAGE_NAME
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#undef PACKAGE_STRING
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#undef PACKAGE_TARNAME
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#undef PACKAGE_URL
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#undef PACKAGE_VERSION
|
||||
|
||||
/* RNG_MT19937 */
|
||||
#undef RNG_MT19937
|
||||
|
||||
/* RNG_RANLUX */
|
||||
#undef RNG_RANLUX
|
||||
|
||||
/* SSE4 Intrinsics */
|
||||
#undef SSE4
|
||||
|
||||
/* Define to 1 if you have the ANSI C header files. */
|
||||
#undef STDC_HEADERS
|
||||
|
||||
/* Version number of package */
|
||||
#undef VERSION
|
||||
|
||||
/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
|
||||
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
|
||||
#define below would cause a syntax error. */
|
||||
#undef _UINT32_T
|
||||
|
||||
/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
|
||||
<pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
|
||||
#define below would cause a syntax error. */
|
||||
#undef _UINT64_T
|
||||
|
||||
/* Define to `unsigned int' if <sys/types.h> does not define. */
|
||||
#undef size_t
|
||||
|
||||
/* Define to the type of an unsigned integer type of width exactly 32 bits if
|
||||
such a type exists and the standard includes do not define it. */
|
||||
#undef uint32_t
|
||||
|
||||
/* Define to the type of an unsigned integer type of width exactly 64 bits if
|
||||
such a type exists and the standard includes do not define it. */
|
||||
#undef uint64_t
|
@ -62,6 +62,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <serialisation/Serialisation.h>
|
||||
#include <Config.h>
|
||||
#include <Timer.h>
|
||||
#include <PerfCount.h>
|
||||
#include <Log.h>
|
||||
#include <AlignedAllocator.h>
|
||||
#include <Simd.h>
|
||||
|
@ -318,7 +318,10 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
exit(0);
|
||||
return;
|
||||
};
|
||||
|
||||
#ifdef GRID_FPE
|
||||
#define _GNU_SOURCE
|
||||
#include <fenv.h>
|
||||
#endif
|
||||
void Grid_debug_handler_init(void)
|
||||
{
|
||||
struct sigaction sa,osa;
|
||||
@ -327,5 +330,9 @@ void Grid_debug_handler_init(void)
|
||||
sa.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGSEGV,&sa,NULL);
|
||||
sigaction(SIGTRAP,&sa,NULL);
|
||||
#ifdef GRID_FPE
|
||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||
sigaction(SIGFPE,&sa,NULL);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -102,7 +102,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
|
||||
////////////////////////////////////////////////////////////
|
||||
void Grid_quiesce_nodes(void)
|
||||
{
|
||||
int me;
|
||||
int me=0;
|
||||
#ifdef GRID_COMMS_MPI
|
||||
MPI_Comm_rank(MPI_COMM_WORLD,&me);
|
||||
#endif
|
||||
|
@ -71,7 +71,7 @@ public:
|
||||
StopWatch.Start();
|
||||
stream << BLACK<< log.topName << BLACK<< " : ";
|
||||
stream << log.COLOUR <<std::setw(10) << std::left << log.name << BLACK << " : ";
|
||||
// stream << YELLOW<< now <<BLACK << " : " ;
|
||||
stream << YELLOW<< now <<BLACK << " : " ;
|
||||
stream << log.COLOUR;
|
||||
return stream;
|
||||
} else {
|
||||
|
File diff suppressed because one or more lines are too long
@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <ctime>
|
||||
#include <chrono>
|
||||
#include <string.h>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <sys/ioctl.h>
|
||||
|
||||
#ifdef __linux__
|
||||
@ -163,8 +163,8 @@ public:
|
||||
{
|
||||
#ifdef __linux__
|
||||
if ( fd!= -1) {
|
||||
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
||||
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
||||
::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
||||
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
||||
}
|
||||
begin =cyclecount();
|
||||
#else
|
||||
@ -176,7 +176,7 @@ public:
|
||||
count=0;
|
||||
#ifdef __linux__
|
||||
if ( fd!= -1) {
|
||||
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
||||
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
||||
::read(fd, &count, sizeof(long long));
|
||||
}
|
||||
elapsed = cyclecount() - begin;
|
||||
@ -187,16 +187,16 @@ public:
|
||||
}
|
||||
void Report(void) {
|
||||
#ifdef __linux__
|
||||
printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
|
||||
std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
|
||||
#else
|
||||
printf("%llu cycles \n", elapsed );
|
||||
std::printf("%llu cycles \n", elapsed );
|
||||
#endif
|
||||
}
|
||||
|
||||
~PerformanceCounter()
|
||||
{
|
||||
#ifdef __linux__
|
||||
close(fd);
|
||||
::close(fd);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -42,6 +42,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
|
||||
#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
|
||||
#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
|
||||
#define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
|
||||
#define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
|
||||
#define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)
|
||||
|
13
lib/Timer.h
13
lib/Timer.h
@ -44,6 +44,7 @@ double usecond(void);
|
||||
typedef std::chrono::system_clock GridClock;
|
||||
typedef std::chrono::time_point<GridClock> GridTimePoint;
|
||||
typedef std::chrono::milliseconds GridTime;
|
||||
typedef std::chrono::microseconds GridUsecs;
|
||||
|
||||
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
|
||||
{
|
||||
@ -55,7 +56,7 @@ class GridStopWatch {
|
||||
private:
|
||||
bool running;
|
||||
GridTimePoint start;
|
||||
GridTime accumulator;
|
||||
GridUsecs accumulator;
|
||||
public:
|
||||
GridStopWatch () {
|
||||
Reset();
|
||||
@ -67,17 +68,21 @@ public:
|
||||
}
|
||||
void Stop(void) {
|
||||
assert(running == true);
|
||||
accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start);
|
||||
accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
|
||||
running = false;
|
||||
};
|
||||
void Reset(void){
|
||||
running = false;
|
||||
start = GridClock::now();
|
||||
accumulator = std::chrono::duration_cast<GridTime>(start-start);
|
||||
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
|
||||
}
|
||||
GridTime Elapsed(void) {
|
||||
assert(running == false);
|
||||
return accumulator;
|
||||
return std::chrono::duration_cast<GridTime>( accumulator );
|
||||
}
|
||||
uint64_t useconds(void){
|
||||
assert(running == false);
|
||||
return (uint64_t) accumulator.count();
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -147,6 +147,56 @@ namespace Grid {
|
||||
}
|
||||
Orthogonalise();
|
||||
}
|
||||
|
||||
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
|
||||
{
|
||||
// Run a Lanczos with sloppy convergence
|
||||
const int Nstop = nn;
|
||||
const int Nk = nn+20;
|
||||
const int Np = nn+20;
|
||||
const int Nm = Nk+Np;
|
||||
const int MaxIt= 10000;
|
||||
RealD resid = 1.0e-3;
|
||||
|
||||
Chebyshev<FineField> Cheb(0.5,64.0,21);
|
||||
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
|
||||
// IRL.lock = 1;
|
||||
|
||||
FineField noise(FineGrid); gaussian(RNG,noise);
|
||||
FineField tmp(FineGrid);
|
||||
std::vector<RealD> eval(Nm);
|
||||
std::vector<FineField> evec(Nm,FineGrid);
|
||||
|
||||
int Nconv;
|
||||
IRL.calc(eval,evec,
|
||||
noise,
|
||||
Nconv);
|
||||
|
||||
// pull back nn vectors
|
||||
for(int b=0;b<nn;b++){
|
||||
|
||||
subspace[b] = evec[b];
|
||||
|
||||
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
|
||||
|
||||
hermop.Op(subspace[b],tmp);
|
||||
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
|
||||
|
||||
noise = tmp - sqrt(eval[b])*subspace[b] ;
|
||||
|
||||
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
|
||||
|
||||
noise = tmp + eval[b]*subspace[b] ;
|
||||
|
||||
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
|
||||
|
||||
}
|
||||
Orthogonalise();
|
||||
for(int b=0;b<nn;b++){
|
||||
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
||||
|
||||
RealD scale;
|
||||
|
@ -39,42 +39,33 @@ class SortEigen {
|
||||
private:
|
||||
|
||||
//hacking for testing for now
|
||||
#if 0
|
||||
static bool less_lmd(RealD left,RealD right){
|
||||
return fabs(left) < fabs(right);
|
||||
}
|
||||
static bool less_pair(std::pair<RealD,Field>& left,
|
||||
std::pair<RealD,Field>& right){
|
||||
return fabs(left.first) < fabs(right.first);
|
||||
}
|
||||
#else
|
||||
private:
|
||||
static bool less_lmd(RealD left,RealD right){
|
||||
return left > right;
|
||||
}
|
||||
static bool less_pair(std::pair<RealD,Field>& left,
|
||||
std::pair<RealD,Field>& right){
|
||||
static bool less_pair(std::pair<RealD,Field const*>& left,
|
||||
std::pair<RealD,Field const*>& right){
|
||||
return left.first > (right.first);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
public:
|
||||
|
||||
void push(DenseVector<RealD>& lmd,
|
||||
DenseVector<Field>& evec,int N) {
|
||||
|
||||
DenseVector<std::pair<RealD, Field> > emod;
|
||||
typename DenseVector<std::pair<RealD, Field> >::iterator it;
|
||||
DenseVector<Field>& evec,int N) {
|
||||
DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
|
||||
for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
|
||||
|
||||
for(int i=0;i<lmd.size();++i){
|
||||
emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
|
||||
}
|
||||
DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());
|
||||
for(int i=0;i<lmd.size();++i)
|
||||
emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
|
||||
|
||||
partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
|
||||
|
||||
it=emod.begin();
|
||||
typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
|
||||
for(int i=0;i<N;++i){
|
||||
lmd[i]=it->first;
|
||||
evec[i]=it->second;
|
||||
evec[i]=*(it->second);
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
@ -637,21 +637,20 @@ until convergence
|
||||
abort();
|
||||
|
||||
converged:
|
||||
// Sorting
|
||||
|
||||
eval.clear();
|
||||
evec.clear();
|
||||
for(int i=0; i<Nconv; ++i){
|
||||
eval.push_back(eval2[Iconv[i]]);
|
||||
evec.push_back(B[Iconv[i]]);
|
||||
}
|
||||
_sort.push(eval,evec,Nconv);
|
||||
|
||||
std::cout << "\n Converged\n Summary :\n";
|
||||
std::cout << " -- Iterations = "<< Nconv << "\n";
|
||||
std::cout << " -- beta(k) = "<< beta_k << "\n";
|
||||
std::cout << " -- Nconv = "<< Nconv << "\n";
|
||||
}
|
||||
// Sorting
|
||||
eval.resize(Nconv);
|
||||
evec.resize(Nconv,grid);
|
||||
for(int i=0; i<Nconv; ++i){
|
||||
eval[i] = eval2[Iconv[i]];
|
||||
evec[i] = B[Iconv[i]];
|
||||
}
|
||||
_sort.push(eval,evec,Nconv);
|
||||
|
||||
std::cout << "\n Converged\n Summary :\n";
|
||||
std::cout << " -- Iterations = "<< Nconv << "\n";
|
||||
std::cout << " -- beta(k) = "<< beta_k << "\n";
|
||||
std::cout << " -- Nconv = "<< Nconv << "\n";
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////
|
||||
// Adapted from Rudy's lanczos factor routine
|
||||
|
@ -192,7 +192,7 @@ namespace Grid {
|
||||
return cp;
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage<< " VPGCR_step resid " <<sqrt(cp/rsq)<<std::endl;
|
||||
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
|
||||
|
||||
PrecTimer.Start();
|
||||
Preconditioner(r,z);// solve Az = r
|
||||
|
@ -86,6 +86,7 @@ class CartesianCommunicator {
|
||||
void GlobalSumVector(RealD *,int N);
|
||||
|
||||
void GlobalSum(uint32_t &);
|
||||
void GlobalSum(uint64_t &);
|
||||
|
||||
void GlobalSum(ComplexF &c)
|
||||
{
|
||||
|
@ -73,6 +73,10 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
|
@ -53,6 +53,7 @@ void CartesianCommunicator::GlobalSum(float &){}
|
||||
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
||||
void CartesianCommunicator::GlobalSum(double &){}
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &){}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &){}
|
||||
void CartesianCommunicator::GlobalSumVector(double *,int N){}
|
||||
|
||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||
|
@ -101,6 +101,22 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
shmem_barrier_all(); // necessary?
|
||||
u = dest;
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
static long long source ;
|
||||
static long long dest ;
|
||||
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||
|
||||
// int nreduce=1;
|
||||
// int pestart=0;
|
||||
// int logStride=0;
|
||||
|
||||
source = u;
|
||||
dest = 0;
|
||||
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||
shmem_barrier_all(); // necessary?
|
||||
u = dest;
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
static float source ;
|
||||
static float dest ;
|
||||
|
@ -168,6 +168,7 @@ class BinaryIO {
|
||||
GridBase *grid = Umu._grid;
|
||||
|
||||
std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
|
||||
GridStopWatch timer; timer.Start();
|
||||
|
||||
int ieee32big = (format == std::string("IEEE32BIG"));
|
||||
int ieee32 = (format == std::string("IEEE32"));
|
||||
@ -182,6 +183,7 @@ class BinaryIO {
|
||||
|
||||
Umu = zero;
|
||||
uint32_t csum=0;
|
||||
uint64_t bytes=0;
|
||||
fobj file_object;
|
||||
sobj munged;
|
||||
|
||||
@ -194,7 +196,7 @@ class BinaryIO {
|
||||
|
||||
if ( grid->IsBoss() ) {
|
||||
fin.read((char *)&file_object,sizeof(file_object));
|
||||
|
||||
bytes += sizeof(file_object);
|
||||
if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
|
||||
if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object));
|
||||
if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
|
||||
@ -205,6 +207,10 @@ class BinaryIO {
|
||||
// The boss who read the file has their value poked
|
||||
pokeSite(munged,Umu,site);
|
||||
}}}}
|
||||
timer.Stop();
|
||||
std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||
<< (double)bytes/ (double)timer.useconds() <<" MB/s " <<std::endl;
|
||||
|
||||
return csum;
|
||||
}
|
||||
|
||||
@ -224,13 +230,14 @@ class BinaryIO {
|
||||
// Serialise through node zero
|
||||
//////////////////////////////////////////////////
|
||||
std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
|
||||
GridStopWatch timer; timer.Start();
|
||||
|
||||
std::ofstream fout;
|
||||
if ( grid->IsBoss() ) {
|
||||
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
|
||||
fout.seekp(offset);
|
||||
}
|
||||
|
||||
uint64_t bytes=0;
|
||||
uint32_t csum=0;
|
||||
fobj file_object;
|
||||
sobj unmunged;
|
||||
@ -252,10 +259,15 @@ class BinaryIO {
|
||||
if(ieee32) htole32_v((void *)&file_object,sizeof(file_object));
|
||||
if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
|
||||
if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));
|
||||
|
||||
|
||||
// NB could gather an xstrip as an optimisation.
|
||||
fout.write((char *)&file_object,sizeof(file_object));
|
||||
bytes+=sizeof(file_object);
|
||||
}
|
||||
}}}}
|
||||
timer.Stop();
|
||||
std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
||||
|
||||
return csum;
|
||||
}
|
||||
@ -429,6 +441,9 @@ class BinaryIO {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
GridStopWatch timer; timer.Start();
|
||||
uint64_t bytes=0;
|
||||
|
||||
int myrank = grid->ThisRank();
|
||||
int iorank = grid->RankFromProcessorCoor(ioproc);
|
||||
|
||||
@ -475,6 +490,7 @@ class BinaryIO {
|
||||
|
||||
fin.seekg(offset+g_idx*sizeof(fileObj));
|
||||
fin.read((char *)&fileObj,sizeof(fileObj));
|
||||
bytes+=sizeof(fileObj);
|
||||
|
||||
if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
|
||||
if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
|
||||
@ -499,7 +515,12 @@ class BinaryIO {
|
||||
}
|
||||
|
||||
grid->GlobalSum(csum);
|
||||
grid->GlobalSum(bytes);
|
||||
grid->Barrier();
|
||||
|
||||
timer.Stop();
|
||||
std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
||||
|
||||
return csum;
|
||||
}
|
||||
@ -563,6 +584,9 @@ class BinaryIO {
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
GridStopWatch timer; timer.Start();
|
||||
uint64_t bytes=0;
|
||||
|
||||
int myrank = grid->ThisRank();
|
||||
int iorank = grid->RankFromProcessorCoor(ioproc);
|
||||
|
||||
@ -635,11 +659,16 @@ class BinaryIO {
|
||||
|
||||
fout.seekp(offset+g_idx*sizeof(fileObj));
|
||||
fout.write((char *)&fileObj,sizeof(fileObj));
|
||||
|
||||
bytes+=sizeof(fileObj);
|
||||
}
|
||||
}
|
||||
|
||||
grid->GlobalSum(csum);
|
||||
grid->GlobalSum(bytes);
|
||||
|
||||
timer.Stop();
|
||||
std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
||||
|
||||
return csum;
|
||||
}
|
||||
|
@ -353,7 +353,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
|
||||
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
|
||||
(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
|
||||
}
|
||||
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
|
||||
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
|
||||
if ( ieee32 || ieee32big ) {
|
||||
//csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
|
||||
csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
|
||||
@ -431,7 +431,7 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
|
||||
|
||||
} else {
|
||||
header.floating_point = std::string("IEEE64BIG");
|
||||
header.data_type = std::string("4D_SU3_GAUGE_3X3");
|
||||
header.data_type = std::string("4D_SU3_GAUGE_3x3");
|
||||
NerscSimpleUnmunger<fobj3D,sobj> munge;
|
||||
BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
|
||||
offset = writeHeader(header,file);
|
||||
|
@ -335,69 +335,7 @@ PARALLEL_FOR_LOOP
|
||||
void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag) {
|
||||
|
||||
assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
|
||||
bool local = true;
|
||||
bool nonlocal = false;
|
||||
if ( dag == DaggerYes ) {
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
st.HaloExchangeComplete(handle);
|
||||
|
||||
local = false;
|
||||
nonlocal = true;
|
||||
if ( dag == DaggerYes ) {
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int sss=0;sss<in._grid->oSites();sss++){
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(0);
|
||||
|
||||
};
|
||||
|
||||
|
@ -281,11 +281,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
// if ( Impl::overlapCommsCompute () ) {
|
||||
// DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
|
||||
// } else {
|
||||
DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
|
||||
// }
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
@ -368,7 +364,7 @@ PARALLEL_FOR_LOOP
|
||||
sU = lo.Reorder(sU);
|
||||
}
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -423,14 +419,12 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
assert(0);
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
alltime-=usecond();
|
||||
|
||||
Compressor compressor(dag);
|
||||
|
||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||
@ -442,116 +436,232 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
|
||||
|
||||
commtime -=usecond();
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
st.HaloExchangeComplete(handle);
|
||||
commtime +=usecond();
|
||||
|
||||
jointime -=usecond();
|
||||
jointime +=usecond();
|
||||
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
// Not loop ordering and data layout.
|
||||
// Designed to create
|
||||
// - per thread reuse in L1 cache for U
|
||||
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
||||
bool local = true;
|
||||
bool nonlocal = false;
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
for(int jjj=0;jjj<100;jjj++){
|
||||
#pragma omp barrier
|
||||
dslashtime -=usecond();
|
||||
if ( dag == DaggerYes ) {
|
||||
if( this->HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
{
|
||||
int sd;
|
||||
for(sd=0;sd<Ls;sd++){
|
||||
int sU=ss;
|
||||
int sF = sd+Ls*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( this->HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
if( this->AsmOptDslash ) {
|
||||
// for(int i=0;i<1;i++){
|
||||
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
||||
// PerformanceCounter Counter(i);
|
||||
// Counter.Start();
|
||||
|
||||
#pragma omp for
|
||||
for(int t=0;t<threads;t++){
|
||||
|
||||
int hyperthread = t%HT;
|
||||
int core = t/HT;
|
||||
|
||||
int sswork, swork,soff,ssoff, sU,sF;
|
||||
|
||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
||||
|
||||
for(int ss=0;ss<sswork;ss++){
|
||||
for(int s=soff;s<soff+swork;s++){
|
||||
|
||||
sU=ss+ ssoff;
|
||||
|
||||
if ( LebesgueOrder::UseLebesgueOrder ) {
|
||||
sU = lo.Reorder(sU);
|
||||
}
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Counter.Stop();
|
||||
// Counter.Report();
|
||||
// }
|
||||
} else if( this->HandOptDslash ) {
|
||||
#pragma omp for
|
||||
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dslashtime +=usecond();
|
||||
alltime+=usecond();
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
alltime-=usecond();
|
||||
Compressor compressor(dag);
|
||||
|
||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
int HT = GridThread::GetHyperThreads();
|
||||
int cores = GridThread::GetCores();
|
||||
int nwork = U._grid->oSites();
|
||||
|
||||
commtime -=usecond();
|
||||
auto handle = st.HaloExchangeBegin(in,compressor);
|
||||
st.HaloExchangeComplete(handle);
|
||||
commtime +=usecond();
|
||||
|
||||
jointime -=usecond();
|
||||
st.HaloExchangeComplete(handle);
|
||||
jointime +=usecond();
|
||||
|
||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||
// Not loop ordering and data layout.
|
||||
// Designed to create
|
||||
// - per thread reuse in L1 cache for U
|
||||
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
|
||||
|
||||
local = false;
|
||||
nonlocal = true;
|
||||
dslash1time -=usecond();
|
||||
#pragma omp parallel
|
||||
{
|
||||
for(int jjj=0;jjj<100;jjj++){
|
||||
#pragma omp barrier
|
||||
dslashtime -=usecond();
|
||||
if ( dag == DaggerYes ) {
|
||||
if( this->HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
int sU=0;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
{
|
||||
int sd;
|
||||
for(sd=0;sd<Ls;sd++){
|
||||
int sU=ss;
|
||||
int sU=0;
|
||||
int sF = sd+Ls*sU;
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if( this->HandOptDslash ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
if( this->AsmOptDslash ) {
|
||||
// for(int i=0;i<1;i++){
|
||||
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
||||
// PerformanceCounter Counter(i);
|
||||
// Counter.Start();
|
||||
|
||||
#pragma omp for
|
||||
for(int t=0;t<threads;t++){
|
||||
|
||||
int hyperthread = t%HT;
|
||||
int core = t/HT;
|
||||
|
||||
int sswork, swork,soff,ssoff, sU,sF;
|
||||
|
||||
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
|
||||
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
|
||||
|
||||
for(int ss=0;ss<sswork;ss++){
|
||||
for(int s=soff;s<soff+swork;s++){
|
||||
|
||||
sU=0;
|
||||
sF = s+Ls*sU;
|
||||
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Counter.Stop();
|
||||
// Counter.Report();
|
||||
// }
|
||||
} else if( this->HandOptDslash ) {
|
||||
#pragma omp for
|
||||
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
int sU=0;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
#pragma omp for
|
||||
for(int ss=0;ss<U._grid->oSites();ss++){
|
||||
int sU=ss;
|
||||
int sU=0;
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
|
||||
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dslash1time +=usecond();
|
||||
}
|
||||
}
|
||||
dslashtime +=usecond();
|
||||
alltime+=usecond();
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
|
@ -1,3 +1,4 @@
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@ -120,6 +121,20 @@ namespace Grid {
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalOMPbench(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalL1bench(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalCommsThenCompute(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
@ -148,7 +163,7 @@ namespace Grid {
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Data members require to support the functionality
|
||||
///////////////////////////////////////////////////////////////
|
||||
protected:
|
||||
public:
|
||||
|
||||
// Add these to the support from Wilson
|
||||
GridBase *_FourDimGrid;
|
||||
|
@ -38,216 +38,177 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor *chi_p;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
|
||||
int num = 0;
|
||||
|
||||
result=zero;
|
||||
|
||||
///////////////////////////
|
||||
// Xp
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Xp,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if (SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjXp(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjXp(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
|
||||
accumReconXp(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
|
||||
spReconXp(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Yp
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Yp,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjYp(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjYp(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
|
||||
accumReconYp(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
|
||||
accumReconYp(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Zp
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Zp,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjZp(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjZp(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
|
||||
accumReconZp(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
|
||||
accumReconZp(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Tp
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Tp,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjTp(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjTp(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
|
||||
accumReconTp(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
|
||||
accumReconTp(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Xm
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Xm,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjXm(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjXm(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
|
||||
accumReconXm(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
|
||||
accumReconXm(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Ym
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Ym,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjYm(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjYm(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
|
||||
accumReconYm(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
|
||||
accumReconYm(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Zm
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Zm,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjZm(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjZm(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
|
||||
accumReconZm(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
|
||||
accumReconZm(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Tm
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Tm,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjTm(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjTm(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
|
||||
accumReconTm(result,Uchi);
|
||||
|
||||
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
|
||||
accumReconTm(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
|
||||
if ( local ) {
|
||||
vstream(out._odata[sF],result);
|
||||
} else if ( num ) {
|
||||
vstream(out._odata[sF],out._odata[sF]+result);
|
||||
}
|
||||
vstream(out._odata[sF],result);
|
||||
};
|
||||
|
||||
|
||||
@ -255,216 +216,177 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
SiteHalfSpinor tmp;
|
||||
SiteHalfSpinor chi;
|
||||
SiteHalfSpinor *chi_p;
|
||||
SiteHalfSpinor Uchi;
|
||||
SiteSpinor result;
|
||||
StencilEntry *SE;
|
||||
int ptype;
|
||||
|
||||
int num = 0;
|
||||
|
||||
result=zero;
|
||||
|
||||
///////////////////////////
|
||||
// Xp
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Xm,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjXp(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjXp(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
|
||||
accumReconXp(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
|
||||
spReconXp(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Yp
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Ym,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjYp(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjYp(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
|
||||
accumReconYp(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
|
||||
accumReconYp(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Zp
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Zm,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjZp(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjZp(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
|
||||
accumReconZp(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
|
||||
accumReconZp(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Tp
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Tm,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjTp(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjTp(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
|
||||
accumReconTp(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
|
||||
accumReconTp(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Xm
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Xp,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjXm(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjXm(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
|
||||
accumReconXm(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
|
||||
accumReconXm(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Ym
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Yp,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjYm(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjYm(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
|
||||
accumReconYm(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
|
||||
accumReconYm(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Zm
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Zp,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjZm(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjZm(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
|
||||
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
|
||||
accumReconZm(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
|
||||
accumReconZm(result,Uchi);
|
||||
|
||||
///////////////////////////
|
||||
// Tm
|
||||
///////////////////////////
|
||||
SE=st.GetEntry(ptype,Tp,sF);
|
||||
|
||||
if (local && SE->_is_local ) {
|
||||
if ( SE->_is_local ) {
|
||||
chi_p = χ
|
||||
if ( SE->_permute ) {
|
||||
spProjTm(tmp,in._odata[SE->_offset]);
|
||||
permute(chi,tmp,ptype);
|
||||
} else {
|
||||
spProjTm(chi,in._odata[SE->_offset]);
|
||||
}
|
||||
} else {
|
||||
chi_p=&buf[SE->_offset];
|
||||
}
|
||||
|
||||
if ( nonlocal && (!SE->_is_local) ) {
|
||||
chi=buf[SE->_offset];
|
||||
}
|
||||
Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
|
||||
accumReconTm(result,Uchi);
|
||||
|
||||
if( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
|
||||
Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
|
||||
accumReconTm(result,Uchi);
|
||||
num++;
|
||||
}
|
||||
|
||||
if ( local ) {
|
||||
vstream(out._odata[sF],result);
|
||||
} else if ( num ) {
|
||||
vstream(out._odata[sF],out._odata[sF]+result);
|
||||
}
|
||||
vstream(out._odata[sF],result);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
@ -596,11 +518,11 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||
vstream(out._odata[sF],result);
|
||||
}
|
||||
|
||||
#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
|
||||
#if ( ! defined(IMCI) && ! defined(AVX512) )
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||
}
|
||||
|
@ -48,11 +48,11 @@ namespace Grid {
|
||||
public:
|
||||
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
|
||||
int sF,int sU,const FermionField &in,FermionField &out);
|
||||
|
||||
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
@ -60,15 +60,15 @@ namespace Grid {
|
||||
|
||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
int sF,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
WilsonKernels(const ImplParams &p= ImplParams());
|
||||
|
||||
|
@ -28,76 +28,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
#if defined(AVX512) || defined (IMCI)
|
||||
//#if defined (IMCI)
|
||||
|
||||
#include <simd/Avx512Asm.h>
|
||||
#include <simd/Intel512wilson.h>
|
||||
|
||||
#undef VLOAD
|
||||
#undef VSTORE
|
||||
#undef VMUL
|
||||
#undef VMADD
|
||||
#undef ZEND
|
||||
#undef ZLOAD
|
||||
#undef ZMUL
|
||||
#undef ZMADD
|
||||
#undef VZERO
|
||||
#undef VTIMESI
|
||||
#undef VTIMESMINUSI
|
||||
#include <simd/Intel512single.h>
|
||||
|
||||
#define VZERO(A) VZEROf(A)
|
||||
#define VMOV(A,B) VMOVf(A,B)
|
||||
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
|
||||
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
|
||||
|
||||
#define VADD(A,B,C) VADDf(A,B,C)
|
||||
#define VSUB(A,B,C) VSUBf(A,B,C)
|
||||
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z)
|
||||
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
|
||||
|
||||
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
|
||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
|
||||
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
|
||||
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
|
||||
|
||||
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
|
||||
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
|
||||
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
|
||||
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
|
||||
|
||||
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
|
||||
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
|
||||
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
|
||||
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
|
||||
|
||||
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
|
||||
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
|
||||
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
|
||||
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
|
||||
|
||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
|
||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
|
||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
|
||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
|
||||
|
||||
#define VPERM0(A,B) VPERM0f(A,B)
|
||||
#define VPERM1(A,B) VPERM1f(A,B)
|
||||
#define VPERM2(A,B) VPERM2f(A,B)
|
||||
#define VPERM3(A,B) VPERM3f(A,B)
|
||||
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
|
||||
|
||||
#define ZEND1(A,B,C) ZEND1f(A,B,C)
|
||||
#define ZEND2(A,B,C) ZEND2f(A,B,C)
|
||||
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
|
||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
||||
|
||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
||||
|
||||
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
|
||||
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
|
||||
|
||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
|
||||
namespace Grid {
|
||||
namespace QCD {
|
||||
@ -105,7 +41,7 @@ namespace QCD {
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
uint64_t now;
|
||||
uint64_t first ;
|
||||
@ -127,26 +63,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
|
||||
SE=st.GetEntry(ptype,Xm,ss);
|
||||
|
||||
#if 0
|
||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
||||
else pf=(void *)&pbuf[SE->_offset];
|
||||
|
||||
LOAD64(%r9,pf);
|
||||
__asm__(
|
||||
VPREFETCH(0,%r9)
|
||||
VPREFETCH(1,%r9)
|
||||
VPREFETCH(2,%r9)
|
||||
VPREFETCH(3,%r9)
|
||||
VPREFETCH(4,%r9)
|
||||
VPREFETCH(5,%r9)
|
||||
VPREFETCH(6,%r9)
|
||||
VPREFETCH(7,%r9)
|
||||
VPREFETCH(8,%r9)
|
||||
VPREFETCH(9,%r9)
|
||||
VPREFETCH(10,%r9)
|
||||
VPREFETCH(11,%r9) );
|
||||
#endif
|
||||
|
||||
// Xm
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
@ -158,7 +74,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
else pf=(void *)&pbuf[SE->_offset];
|
||||
|
||||
if ( local ) {
|
||||
XM_PROJMEM(&plocal[offset]);
|
||||
XP_PROJMEM(&plocal[offset]);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
@ -168,7 +84,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXM(Xm,pf);
|
||||
}
|
||||
XM_RECON;
|
||||
XP_RECON;
|
||||
|
||||
// Ym
|
||||
offset = SE->_offset;
|
||||
@ -181,7 +97,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
else pf=(void *)&pbuf[SE->_offset];
|
||||
|
||||
if ( local ) {
|
||||
YM_PROJMEM(&plocal[offset]);
|
||||
YP_PROJMEM(&plocal[offset]);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
@ -191,7 +107,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYM(Ym,pf);
|
||||
}
|
||||
YM_RECON_ACCUM;
|
||||
YP_RECON_ACCUM;
|
||||
|
||||
// Zm
|
||||
offset = SE->_offset;
|
||||
@ -204,7 +120,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
else pf=(void *)&pbuf[SE->_offset];
|
||||
|
||||
if ( local ) {
|
||||
ZM_PROJMEM(&plocal[offset]);
|
||||
ZP_PROJMEM(&plocal[offset]);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
@ -214,7 +130,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZM(Zm,pf);
|
||||
}
|
||||
ZM_RECON_ACCUM;
|
||||
ZP_RECON_ACCUM;
|
||||
|
||||
// Tm
|
||||
offset = SE->_offset;
|
||||
@ -227,7 +143,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
|
||||
|
||||
if ( local ) {
|
||||
TM_PROJMEM(&plocal[offset]);
|
||||
TP_PROJMEM(&plocal[offset]);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
@ -237,7 +153,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTM(Tm,pf);
|
||||
}
|
||||
TM_RECON_ACCUM;
|
||||
TP_RECON_ACCUM;
|
||||
|
||||
// Tp
|
||||
offset = SE->_offset;
|
||||
@ -250,7 +166,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
else pf=(void *)&pbuf[SE->_offset];
|
||||
|
||||
if ( local ) {
|
||||
TP_PROJMEM(&plocal[offset]);
|
||||
TM_PROJMEM(&plocal[offset]);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
@ -260,7 +176,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
{
|
||||
MULT_2SPIN_DIR_PFTP(Tp,pf);
|
||||
}
|
||||
TP_RECON_ACCUM;
|
||||
TM_RECON_ACCUM;
|
||||
|
||||
// Zp
|
||||
offset = SE->_offset;
|
||||
@ -273,7 +189,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
else pf=(void *)&pbuf[SE->_offset];
|
||||
|
||||
if ( local ) {
|
||||
ZP_PROJMEM(&plocal[offset]);
|
||||
ZM_PROJMEM(&plocal[offset]);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
@ -283,7 +199,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
{
|
||||
MULT_2SPIN_DIR_PFZP(Zp,pf);
|
||||
}
|
||||
ZP_RECON_ACCUM;
|
||||
ZM_RECON_ACCUM;
|
||||
|
||||
|
||||
offset = SE->_offset;
|
||||
@ -296,7 +212,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
else pf=(void *)&pbuf[SE->_offset];
|
||||
|
||||
if ( local ) {
|
||||
YP_PROJMEM(&plocal[offset]);
|
||||
YM_PROJMEM(&plocal[offset]);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
@ -306,22 +222,20 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
{
|
||||
MULT_2SPIN_DIR_PFYP(Yp,pf);
|
||||
}
|
||||
YP_RECON_ACCUM;
|
||||
YM_RECON_ACCUM;
|
||||
|
||||
// Xp
|
||||
perm = SE->_permute;
|
||||
offset = SE->_offset;
|
||||
local = SE->_is_local;
|
||||
|
||||
// PREFETCH_R(A);
|
||||
|
||||
// Prefetch
|
||||
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
|
||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
||||
else pf=(void *)&pbuf[SE->_offset];
|
||||
|
||||
if ( local ) {
|
||||
XP_PROJMEM(&plocal[offset]);
|
||||
XM_PROJMEM(&plocal[offset]);
|
||||
if ( perm) {
|
||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
@ -331,7 +245,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
{
|
||||
MULT_2SPIN_DIR_PFXP(Xp,pf);
|
||||
}
|
||||
XP_RECON_ACCUM;
|
||||
XM_RECON_ACCUM;
|
||||
|
||||
debug:
|
||||
SAVE_RESULT(&out._odata[ss]);
|
||||
@ -340,6 +254,7 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
|
||||
template class WilsonKernels<WilsonImplF>;
|
||||
template class WilsonKernels<WilsonImplD>;
|
||||
|
||||
template class WilsonKernels<GparityWilsonImplF>;
|
||||
template class WilsonKernels<GparityWilsonImplD>;
|
||||
}}
|
||||
#endif
|
||||
|
@ -308,548 +308,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
namespace Grid {
|
||||
namespace QCD {
|
||||
|
||||
#if 0
|
||||
template<class Impl>
|
||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
{
|
||||
// std::cout << "Hand op Dhop "<<std::endl;
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
||||
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
||||
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
||||
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
||||
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
||||
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
||||
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
||||
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
||||
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
||||
|
||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
||||
REGISTER Simd Chi_01;
|
||||
REGISTER Simd Chi_02;
|
||||
|
||||
REGISTER Simd Chi_10;
|
||||
REGISTER Simd Chi_11;
|
||||
REGISTER Simd Chi_12; // 14 left
|
||||
|
||||
REGISTER Simd UChi_00; // two spinor; 6 regs
|
||||
REGISTER Simd UChi_01;
|
||||
REGISTER Simd UChi_02;
|
||||
|
||||
REGISTER Simd UChi_10;
|
||||
REGISTER Simd UChi_11;
|
||||
REGISTER Simd UChi_12; // 8 left
|
||||
|
||||
REGISTER Simd U_00; // two rows of U matrix
|
||||
REGISTER Simd U_10;
|
||||
REGISTER Simd U_20;
|
||||
REGISTER Simd U_01;
|
||||
REGISTER Simd U_11;
|
||||
REGISTER Simd U_21; // 2 reg left.
|
||||
|
||||
#define Chimu_00 Chi_00
|
||||
#define Chimu_01 Chi_01
|
||||
#define Chimu_02 Chi_02
|
||||
#define Chimu_10 Chi_10
|
||||
#define Chimu_11 Chi_11
|
||||
#define Chimu_12 Chi_12
|
||||
#define Chimu_20 UChi_00
|
||||
#define Chimu_21 UChi_01
|
||||
#define Chimu_22 UChi_02
|
||||
#define Chimu_30 UChi_10
|
||||
#define Chimu_31 UChi_11
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset, ptype;
|
||||
int num = 0;
|
||||
|
||||
// Xp
|
||||
SE=st.GetEntry(ptype,Xp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
XP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Xp);
|
||||
XP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Yp
|
||||
SE=st.GetEntry(ptype,Yp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
YP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Yp);
|
||||
YP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
|
||||
// Zp
|
||||
SE=st.GetEntry(ptype,Zp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
ZP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Zp);
|
||||
ZP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Tp
|
||||
SE=st.GetEntry(ptype,Tp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
TP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Tp);
|
||||
TP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Xm
|
||||
SE=st.GetEntry(ptype,Xm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
XM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Xm);
|
||||
XM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Ym
|
||||
SE=st.GetEntry(ptype,Ym,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
YM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Ym);
|
||||
YM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Zm
|
||||
SE=st.GetEntry(ptype,Zm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
ZM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Zm);
|
||||
ZM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Tm
|
||||
SE=st.GetEntry(ptype,Tm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
TM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Tm);
|
||||
TM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
SiteSpinor & ref (out._odata[ss]);
|
||||
if ( Local ) {
|
||||
vstream(ref()(0)(0),result_00);
|
||||
vstream(ref()(0)(1),result_01);
|
||||
vstream(ref()(0)(2),result_02);
|
||||
vstream(ref()(1)(0),result_10);
|
||||
vstream(ref()(1)(1),result_11);
|
||||
vstream(ref()(1)(2),result_12);
|
||||
vstream(ref()(2)(0),result_20);
|
||||
vstream(ref()(2)(1),result_21);
|
||||
vstream(ref()(2)(2),result_22);
|
||||
vstream(ref()(3)(0),result_30);
|
||||
vstream(ref()(3)(1),result_31);
|
||||
vstream(ref()(3)(2),result_32);
|
||||
return 1;
|
||||
} else if ( num ) {
|
||||
vstream(ref()(0)(0),ref()(0)(0)+result_00);
|
||||
vstream(ref()(0)(1),ref()(0)(1)+result_01);
|
||||
vstream(ref()(0)(2),ref()(0)(2)+result_02);
|
||||
vstream(ref()(1)(0),ref()(1)(0)+result_10);
|
||||
vstream(ref()(1)(1),ref()(1)(1)+result_11);
|
||||
vstream(ref()(1)(2),ref()(1)(2)+result_12);
|
||||
vstream(ref()(2)(0),ref()(2)(0)+result_20);
|
||||
vstream(ref()(2)(1),ref()(2)(1)+result_21);
|
||||
vstream(ref()(2)(2),ref()(2)(2)+result_22);
|
||||
vstream(ref()(3)(0),ref()(3)(0)+result_30);
|
||||
vstream(ref()(3)(1),ref()(3)(1)+result_31);
|
||||
vstream(ref()(3)(2),ref()(3)(2)+result_32);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
template<class Impl>
|
||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
{
|
||||
// std::cout << "Hand op Dhop "<<std::endl;
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
|
||||
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
||||
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
||||
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
||||
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
||||
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
||||
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
||||
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
||||
|
||||
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
||||
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
||||
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
||||
|
||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
||||
REGISTER Simd Chi_01;
|
||||
REGISTER Simd Chi_02;
|
||||
|
||||
REGISTER Simd Chi_10;
|
||||
REGISTER Simd Chi_11;
|
||||
REGISTER Simd Chi_12; // 14 left
|
||||
|
||||
REGISTER Simd UChi_00; // two spinor; 6 regs
|
||||
REGISTER Simd UChi_01;
|
||||
REGISTER Simd UChi_02;
|
||||
|
||||
REGISTER Simd UChi_10;
|
||||
REGISTER Simd UChi_11;
|
||||
REGISTER Simd UChi_12; // 8 left
|
||||
|
||||
REGISTER Simd U_00; // two rows of U matrix
|
||||
REGISTER Simd U_10;
|
||||
REGISTER Simd U_20;
|
||||
REGISTER Simd U_01;
|
||||
REGISTER Simd U_11;
|
||||
REGISTER Simd U_21; // 2 reg left.
|
||||
|
||||
#define Chimu_00 Chi_00
|
||||
#define Chimu_01 Chi_01
|
||||
#define Chimu_02 Chi_02
|
||||
#define Chimu_10 Chi_10
|
||||
#define Chimu_11 Chi_11
|
||||
#define Chimu_12 Chi_12
|
||||
#define Chimu_20 UChi_00
|
||||
#define Chimu_21 UChi_01
|
||||
#define Chimu_22 UChi_02
|
||||
#define Chimu_30 UChi_10
|
||||
#define Chimu_31 UChi_11
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
|
||||
StencilEntry *SE;
|
||||
int offset, ptype;
|
||||
int num = 0;
|
||||
|
||||
// Xp
|
||||
SE=st.GetEntry(ptype,Xp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
XM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Xp);
|
||||
XM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
|
||||
// Yp
|
||||
SE=st.GetEntry(ptype,Yp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
YM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Yp);
|
||||
YM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
|
||||
// Zp
|
||||
SE=st.GetEntry(ptype,Zp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
ZM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Zp);
|
||||
ZM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Tp
|
||||
SE=st.GetEntry(ptype,Tp,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
TM_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Tp);
|
||||
TM_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Xm
|
||||
SE=st.GetEntry(ptype,Xm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
XP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Xm);
|
||||
XP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Ym
|
||||
SE=st.GetEntry(ptype,Ym,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
YP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Ym);
|
||||
YP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Zm
|
||||
SE=st.GetEntry(ptype,Zm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
ZP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Zm);
|
||||
ZP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
// Tm
|
||||
SE=st.GetEntry(ptype,Tm,ss);
|
||||
offset = SE->_offset;
|
||||
|
||||
if (Local && SE->_is_local ) {
|
||||
LOAD_CHIMU;
|
||||
TP_PROJ;
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
}
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
LOAD_CHI;
|
||||
}
|
||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
||||
MULT_2SPIN(Tm);
|
||||
TP_RECON_ACCUM;
|
||||
num++;
|
||||
}
|
||||
|
||||
SiteSpinor & ref (out._odata[ss]);
|
||||
if ( Local ) {
|
||||
vstream(ref()(0)(0),result_00);
|
||||
vstream(ref()(0)(1),result_01);
|
||||
vstream(ref()(0)(2),result_02);
|
||||
vstream(ref()(1)(0),result_10);
|
||||
vstream(ref()(1)(1),result_11);
|
||||
vstream(ref()(1)(2),result_12);
|
||||
vstream(ref()(2)(0),result_20);
|
||||
vstream(ref()(2)(1),result_21);
|
||||
vstream(ref()(2)(2),result_22);
|
||||
vstream(ref()(3)(0),result_30);
|
||||
vstream(ref()(3)(1),result_31);
|
||||
vstream(ref()(3)(2),result_32);
|
||||
return 1;
|
||||
} else if ( num ) {
|
||||
vstream(ref()(0)(0),ref()(0)(0)+result_00);
|
||||
vstream(ref()(0)(1),ref()(0)(1)+result_01);
|
||||
vstream(ref()(0)(2),ref()(0)(2)+result_02);
|
||||
vstream(ref()(1)(0),ref()(1)(0)+result_10);
|
||||
vstream(ref()(1)(1),ref()(1)(1)+result_11);
|
||||
vstream(ref()(1)(2),ref()(1)(2)+result_12);
|
||||
vstream(ref()(2)(0),ref()(2)(0)+result_20);
|
||||
vstream(ref()(2)(1),ref()(2)(1)+result_21);
|
||||
vstream(ref()(2)(2),ref()(2)(2)+result_22);
|
||||
vstream(ref()(3)(0),ref()(3)(0)+result_30);
|
||||
vstream(ref()(3)(1),ref()(3)(1)+result_31);
|
||||
vstream(ref()(3)(2),ref()(3)(2)+result_32);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
template<class Impl>
|
||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
typedef typename Simd::scalar_type S;
|
||||
typedef typename Simd::vector_type V;
|
||||
@ -1094,7 +557,7 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
|
||||
template<class Impl>
|
||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out,bool l, bool nl)
|
||||
int ss,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
// std::cout << "Hand op Dhop "<<std::endl;
|
||||
typedef typename Simd::scalar_type S;
|
||||
@ -1337,14 +800,13 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
////////////////////////////////////////////////
|
||||
// Specialise Gparity to simple implementation
|
||||
////////////////////////////////////////////////
|
||||
template<>
|
||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
||||
//check consistency of return types between these functions and the ones in WilsonKernels.cc
|
||||
@ -1355,7 +817,7 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Doub
|
||||
template<>
|
||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||
return 0;
|
||||
@ -1364,7 +826,7 @@ int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,D
|
||||
template<>
|
||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||
return 0;
|
||||
@ -1373,7 +835,7 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Doub
|
||||
template<>
|
||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
int sF,int sU,const FermionField &in, FermionField &out)
|
||||
{
|
||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
||||
return 0;
|
||||
@ -1383,29 +845,29 @@ int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,D
|
||||
|
||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
|
||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
||||
int ss,int sU,const FermionField &in, FermionField &out);
|
||||
|
||||
}}
|
||||
|
@ -608,14 +608,14 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
|
||||
LatticeMatrix Umu(out._grid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
LieRandomize(pRNG,Umu,0.01);
|
||||
pokeLorentz(out,Umu,mu);
|
||||
PokeIndex<LorentzIndex>(out,Umu,mu);
|
||||
}
|
||||
}
|
||||
static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
|
||||
LatticeMatrix Umu(out._grid);
|
||||
Umu=1.0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
pokeLorentz(out,Umu,mu);
|
||||
PokeIndex<LorentzIndex>(out,Umu,mu);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <immintrin.h>
|
||||
|
||||
|
||||
|
||||
namespace Grid{
|
||||
namespace Optimization {
|
||||
|
||||
struct Vsplat{
|
||||
@ -246,26 +246,30 @@ namespace Optimization {
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
inline __m512 operator()(__m512 in, __m512 ret){
|
||||
__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
|
||||
return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
|
||||
//__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
|
||||
//return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
|
||||
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
|
||||
}
|
||||
//Complex double
|
||||
inline __m512d operator()(__m512d in, __m512d ret){
|
||||
__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
|
||||
return _mm512_shuffle_pd(tmp,tmp,0x55);
|
||||
//__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
|
||||
//return _mm512_shuffle_pd(tmp,tmp,0x55);
|
||||
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
|
||||
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
|
||||
}
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline __m512 operator()(__m512 in, __m512 ret){
|
||||
__m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
|
||||
return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
|
||||
__m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
|
||||
return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
|
||||
}
|
||||
//Complex double
|
||||
inline __m512d operator()(__m512d in, __m512d ret){
|
||||
__m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
|
||||
return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
|
||||
__m512d tmp = _mm512_shuffle_pd(in,in,0x55);
|
||||
return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
|
||||
}
|
||||
|
||||
|
||||
@ -345,7 +349,7 @@ namespace Optimization {
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Here assign types
|
||||
namespace Grid {
|
||||
|
||||
typedef __m512 SIMD_Ftype; // Single precision type
|
||||
typedef __m512d SIMD_Dtype; // Double precision type
|
||||
typedef __m512i SIMD_Itype; // Integer type
|
||||
|
141
lib/simd/Intel512avx.h
Normal file
141
lib/simd/Intel512avx.h
Normal file
@ -0,0 +1,141 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_AV512_H
|
||||
#define GRID_ASM_AV512_H
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Knights Landing specials
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||
|
||||
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
|
||||
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
|
||||
|
||||
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
|
||||
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
|
||||
|
||||
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
|
||||
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
|
||||
|
||||
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMULMEMf(O,P,B,Biirr) \
|
||||
VMULMEMf(O,P,C,Ciirr) \
|
||||
VMULf(tmp,B,Briir) \
|
||||
VMULf(tmp,C,Criir)
|
||||
|
||||
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMULMEMd(O,P,B,Biirr) \
|
||||
VMULMEMd(O,P,C,Ciirr) \
|
||||
VMULd(tmp,B,Briir) \
|
||||
VMULd(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMADDMEMf(O,P,B,Biirr) \
|
||||
VMADDMEMf(O,P,C,Ciirr) \
|
||||
VMADDf(tmp,B,Briir) \
|
||||
VMADDf(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMADDMEMd(O,P,B,Biirr) \
|
||||
VMADDMEMd(O,P,C,Ciirr) \
|
||||
VMADDd(tmp,B,Briir) \
|
||||
VMADDd(tmp,C,Criir)
|
||||
|
||||
// Merges accumulation for complex dot chain; less efficient under avx512
|
||||
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
|
||||
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
|
||||
|
||||
#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
|
||||
"vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
|
||||
"vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"
|
||||
|
||||
#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
|
||||
"vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
|
||||
|
||||
#define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
|
||||
#define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2
|
||||
|
||||
#define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
#define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
|
||||
#define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
|
||||
#define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
|
||||
|
||||
|
||||
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
|
||||
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
|
||||
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
|
||||
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
|
||||
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
|
||||
#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
|
||||
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
|
||||
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
|
||||
#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"
|
||||
|
||||
#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
|
||||
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
|
||||
#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
|
||||
#define VPERM3d(A,B) VMOVd(A,B)
|
||||
|
||||
|
||||
#endif
|
92
lib/simd/Intel512avxAddsub.h
Normal file
92
lib/simd/Intel512avxAddsub.h
Normal file
@ -0,0 +1,92 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_AV512_ADDSUB_H
|
||||
#define GRID_ASM_AV512_ADDSUB_H
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Building blocks for SU3 x 2spinor
|
||||
// Load columns of U
|
||||
// 18 U DUP's rr/ii
|
||||
// 6 Chi shuffles ir,ri
|
||||
// 6muls, 30 fmaddsubs
|
||||
////////////////////////////////////////////////////////////////
|
||||
#define MULT_ADDSUB_2SPIN(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
VMOVIDUPf(0,%r8,Z0 ) \
|
||||
VMOVIDUPf(3,%r8,Z1 )\
|
||||
VMOVIDUPf(6,%r8,Z2 )\
|
||||
VSHUFf(Chi_00,T1) \
|
||||
VSHUFf(Chi_10,T2) \
|
||||
\
|
||||
VMULf(Z0,T1,UChi_00) VMOVRDUPf(0,%r8,Z3 ) \
|
||||
VMULf(Z0,T2,UChi_10) VMOVRDUPf(3,%r8,Z4 ) \
|
||||
VMULf(Z1,T1,UChi_01) VMOVRDUPf(6,%r8,Z5 ) \
|
||||
VMULf(Z1,T2,UChi_11) VMOVIDUPf(1,%r8,Z0 ) \
|
||||
VMULf(Z2,T1,UChi_02) VMOVIDUPf(4,%r8,Z1 ) \
|
||||
VMULf(Z2,T2,UChi_12) VMOVIDUPf(7,%r8,Z2 ) \
|
||||
\
|
||||
VMADDSUBf(Z3,Chi_00,UChi_00) VSHUFf(Chi_01,T1) \
|
||||
VMADDSUBf(Z3,Chi_10,UChi_10) VSHUFf(Chi_11,T2) \
|
||||
VMADDSUBf(Z4,Chi_00,UChi_01) VMOVRDUPf(1,%r8,Z3 ) \
|
||||
VMADDSUBf(Z4,Chi_10,UChi_11)\
|
||||
VMADDSUBf(Z5,Chi_00,UChi_02) VMOVRDUPf(4,%r8,Z4 ) \
|
||||
VMADDSUBf(Z5,Chi_10,UChi_12)\
|
||||
\
|
||||
VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(7,%r8,Z5 ) \
|
||||
VMADDSUBf(Z0,T2,UChi_10)\
|
||||
VMADDSUBf(Z1,T1,UChi_01) VMOVIDUPf(2,%r8,Z0 ) \
|
||||
VMADDSUBf(Z1,T2,UChi_11)\
|
||||
VMADDSUBf(Z2,T1,UChi_02) VMOVIDUPf(5,%r8,Z1 ) \
|
||||
VMADDSUBf(Z2,T2,UChi_12) VMOVIDUPf(8,%r8,Z2 ) \
|
||||
\
|
||||
VMADDSUBf(Z3,Chi_01,UChi_00) VSHUFf(Chi_02,T1) \
|
||||
VMADDSUBf(Z3,Chi_11,UChi_10) VSHUFf(Chi_12,T2) \
|
||||
VMADDSUBf(Z4,Chi_01,UChi_01) VMOVRDUPf(2,%r8,Z3 ) \
|
||||
VMADDSUBf(Z4,Chi_11,UChi_11)\
|
||||
VMADDSUBf(Z5,Chi_01,UChi_02) VMOVRDUPf(5,%r8,Z4 ) \
|
||||
VMADDSUBf(Z5,Chi_11,UChi_12)\
|
||||
\
|
||||
VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(8,%r8,Z5 ) \
|
||||
VMADDSUBf(Z0,T2,UChi_10)\
|
||||
VMADDSUBf(Z1,T1,UChi_01)\
|
||||
VMADDSUBf(Z1,T2,UChi_11)\
|
||||
VMADDSUBf(Z2,T1,UChi_02)\
|
||||
VMADDSUBf(Z2,T2,UChi_12)\
|
||||
\
|
||||
VMADDSUBf(Z3,Chi_02,UChi_00)\
|
||||
VMADDSUBf(Z3,Chi_12,UChi_10)\
|
||||
VMADDSUBf(Z4,Chi_02,UChi_01)\
|
||||
VMADDSUBf(Z4,Chi_12,UChi_11)\
|
||||
VMADDSUBf(Z5,Chi_02,UChi_02)\
|
||||
VMADDSUBf(Z5,Chi_12,UChi_12)\
|
||||
);
|
||||
|
||||
|
||||
#endif
|
140
lib/simd/Intel512common.h
Normal file
140
lib/simd/Intel512common.h
Normal file
@ -0,0 +1,140 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_INTEL_COMMON_512_H
|
||||
#define GRID_ASM_INTEL_COMMON_512_H
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Opcodes common
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#define MASK_REGS \
|
||||
__asm__ ("mov $0xAAAA, %%eax \n"\
|
||||
"kmovw %%eax, %%k6 \n"\
|
||||
"mov $0x5555, %%eax \n"\
|
||||
"kmovw %%eax, %%k7 \n" : : : "%eax");
|
||||
|
||||
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
|
||||
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
|
||||
|
||||
#define VTIMESIf(A,DEST, Z) \
|
||||
VTIMESI0f(A,DEST, Z) \
|
||||
VTIMESI1f(A,DEST, Z) \
|
||||
VTIMESI2f(A,DEST, Z)
|
||||
|
||||
#define VTIMESId(A,DEST, Z) \
|
||||
VTIMESI0d(A,DEST, Z) \
|
||||
VTIMESI1d(A,DEST, Z) \
|
||||
VTIMESI2d(A,DEST, Z)
|
||||
|
||||
#define VTIMESMINUSIf(A,DEST, Z) \
|
||||
VTIMESMINUSI0f(A,DEST, Z) \
|
||||
VTIMESMINUSI1f(A,DEST, Z) \
|
||||
VTIMESMINUSI2f(A,DEST, Z)
|
||||
|
||||
#define VTIMESMINUSId(A,DEST, Z) \
|
||||
VTIMESMINUSI0d(A,DEST, Z) \
|
||||
VTIMESMINUSI1d(A,DEST, Z) \
|
||||
VTIMESMINUSI2d(A,DEST, Z)
|
||||
|
||||
#define VACCTIMESIf(A,ACC,tmp) \
|
||||
VACCTIMESI0f(A,ACC,tmp) \
|
||||
VACCTIMESI1f(A,ACC,tmp) \
|
||||
VACCTIMESI2f(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESId(A,ACC,tmp) \
|
||||
VACCTIMESI0d(A,ACC,tmp) \
|
||||
VACCTIMESI1d(A,ACC,tmp) \
|
||||
VACCTIMESI2d(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESMINUSIf(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI0f(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI1f(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI2f(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESMINUSId(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI0d(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI1d(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI2d(A,ACC,tmp)
|
||||
|
||||
#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A );
|
||||
#define LOAD64(A,ptr) LOAD64i(A,ptr)
|
||||
|
||||
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
|
||||
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
|
||||
|
||||
#define VPREFETCHG(O,A)
|
||||
#define VPREFETCHW(O,A)
|
||||
#define VEVICT(O,A)
|
||||
|
||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
||||
// "clevict0 "#O"*64("#A");\n"
|
||||
|
||||
#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
|
||||
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
|
||||
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n"
|
||||
#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n"
|
||||
#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n"
|
||||
#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
|
||||
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"
|
||||
|
||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||
|
||||
#define VPREFETCHNTA(O,A)
|
||||
#define VPREFETCH(O,A)
|
||||
|
||||
#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
|
||||
// Swaps Re/Im ; could unify this with IMCI
|
||||
#define VSHUFd(A,DEST) "vpshufd $0x4e," #A "," #DEST ";\n"
|
||||
#define VSHUFf(A,DEST) "vpshufd $0xb1," #A "," #DEST ";\n"
|
||||
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
|
||||
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1
|
||||
|
||||
#define TRAP " int3 ;\n"
|
||||
|
||||
#endif
|
135
lib/simd/Intel512double.h
Normal file
135
lib/simd/Intel512double.h
Normal file
@ -0,0 +1,135 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
// No guard can be multiply included as undef clearage
|
||||
#undef VZERO
|
||||
#undef VMOV
|
||||
#undef VLOAD
|
||||
#undef VSTORE
|
||||
#define VZERO(A) VZEROd(A)
|
||||
#define VMOV(A,B) VMOVd(A,B)
|
||||
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
|
||||
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)
|
||||
|
||||
#undef VADD
|
||||
#undef VSUB
|
||||
#undef VMUL
|
||||
#undef VMADD
|
||||
#define VADD(A,B,C) VADDd(A,B,C)
|
||||
#define VSUB(A,B,C) VSUBd(A,B,C)
|
||||
#define VMUL(Uri,Uir,Chi) VMULd(Uri,Uir,Chi)
|
||||
#define VMADD(Uri,Uir,Chi) VMADDd(Uri,Uir,Chi)
|
||||
|
||||
|
||||
#undef VTIMESI
|
||||
#undef VTIMESI0
|
||||
#undef VTIMESI1
|
||||
#undef VTIMESI2
|
||||
#define VTIMESI(A,B,C) VTIMESId(A,B,C)
|
||||
#define VTIMESI0(A,B,C) VTIMESI0d(A,B,C)
|
||||
#define VTIMESI1(A,B,C) VTIMESI1d(A,B,C)
|
||||
#define VTIMESI2(A,B,C) VTIMESI2d(A,B,C)
|
||||
|
||||
#undef VTIMESMINUSI
|
||||
#undef VTIMESMINUSI0
|
||||
#undef VTIMESMINUSI1
|
||||
#undef VTIMESMINUSI2
|
||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
|
||||
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
|
||||
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
|
||||
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)
|
||||
|
||||
#undef VACCTIMESI
|
||||
#undef VACCTIMESI0
|
||||
#undef VACCTIMESI1
|
||||
#undef VACCTIMESI2
|
||||
#define VACCTIMESI(A,B,C) VACCTIMESId(A,B,C)
|
||||
#define VACCTIMESI0(A,B,C) VACCTIMESI0d(A,B,C)
|
||||
#define VACCTIMESI1(A,B,C) VACCTIMESI1d(A,B,C)
|
||||
#define VACCTIMESI2(A,B,C) VACCTIMESI2d(A,B,C)
|
||||
|
||||
#undef VACCTIMESMINUSI
|
||||
#undef VACCTIMESMINUSI0
|
||||
#undef VACCTIMESMINUSI1
|
||||
#undef VACCTIMESMINUSI2
|
||||
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSId(A,B,C)
|
||||
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
|
||||
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
|
||||
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)
|
||||
|
||||
#undef VACCTIMESI1MEM
|
||||
#undef VACCTIMESI2MEM
|
||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
|
||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)
|
||||
|
||||
#undef VACCTIMESMINUSI1MEM
|
||||
#undef VACCTIMESMINUSI2MEM
|
||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
|
||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
|
||||
|
||||
#undef VPERM0
|
||||
#undef VPERM1
|
||||
#undef VPERM2
|
||||
#undef VPERM3
|
||||
#define VPERM0(A,B) VPERM0d(A,B)
|
||||
#define VPERM1(A,B) VPERM1d(A,B)
|
||||
#define VPERM2(A,B) VPERM2d(A,B)
|
||||
#define VPERM3(A,B) VPERM3d(A,B)
|
||||
|
||||
#undef VSHUFMEM
|
||||
#undef VADDMEM
|
||||
#undef VSUBMEM
|
||||
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
|
||||
#define VADDMEM(O,A,B,C) VADDMEMd(O,A,B,C)
|
||||
#define VSUBMEM(O,A,B,C) VSUBMEMd(O,A,B,C)
|
||||
|
||||
#undef VMOVIDUP
|
||||
#undef VMOVRDUP
|
||||
#undef VMADDSUB
|
||||
#undef VSHUF
|
||||
#define VMOVIDUP(A,B,C) VMOVIDUPd(A,B,C)
|
||||
#define VMOVRDUP(A,B,C) VMOVRDUPd(A,B,C)
|
||||
#define VMADDSUB(A,B,accum) VMADDSUBd(A,B,accum)
|
||||
#define VSHUF(A,B) VSHUFd(A,B)
|
||||
|
||||
|
||||
#undef ZEND1
|
||||
#undef ZEND2
|
||||
#undef ZLOAD
|
||||
#undef ZMUL
|
||||
#undef ZMADD
|
||||
#undef ZMULMEM2SP
|
||||
#undef ZMADDMEM2SP
|
||||
|
||||
#define ZEND1(A,B,C) ZEND1d(A,B,C)
|
||||
#define ZEND2(A,B,C) ZEND2d(A,B,C)
|
||||
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
|
||||
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
|
||||
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
|
||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
|
127
lib/simd/Intel512imci.h
Normal file
127
lib/simd/Intel512imci.h
Normal file
@ -0,0 +1,127 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_AV512_H
|
||||
#define GRID_ASM_AV512_H
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Knights Corner specials
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||
|
||||
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
|
||||
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
|
||||
|
||||
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
|
||||
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
|
||||
|
||||
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
|
||||
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
|
||||
|
||||
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMULMEMf(O,P,B,Biirr) \
|
||||
VMULMEMf(O,P,C,Ciirr) \
|
||||
VMULf(tmp,B,Briir) \
|
||||
VMULf(tmp,C,Criir)
|
||||
|
||||
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMULMEMd(O,P,B,Biirr) \
|
||||
VMULMEMd(O,P,C,Ciirr) \
|
||||
VMULd(tmp,B,Briir) \
|
||||
VMULd(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMADDMEMf(O,P,B,Biirr) \
|
||||
VMADDMEMf(O,P,C,Ciirr) \
|
||||
VMADDf(tmp,B,Briir) \
|
||||
VMADDf(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMADDMEMd(O,P,B,Biirr) \
|
||||
VMADDMEMd(O,P,C,Ciirr) \
|
||||
VMADDd(tmp,B,Briir) \
|
||||
VMADDd(tmp,C,Criir)
|
||||
|
||||
#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
|
||||
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
|
||||
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define VTIMESI0f(A,DEST, Z)
|
||||
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESI0d(A,DEST, Z)
|
||||
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0f(A,DEST,Z)
|
||||
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0d(A,DEST,Z)
|
||||
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESI0f(A,ACC,tmp)
|
||||
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESI0d(A,ACC,tmp)
|
||||
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0f(A,ACC,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
// Acc = Acc - i A
|
||||
#define VACCTIMESMINUSI0d(A,ACC,tmp)
|
||||
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
|
||||
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
|
||||
|
||||
#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
|
||||
#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n"
|
||||
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
|
||||
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"
|
||||
|
||||
#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
|
||||
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
|
||||
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
|
||||
#define VPERM3d(A,B) VMOVd(A,B)
|
||||
|
||||
#endif
|
135
lib/simd/Intel512single.h
Normal file
135
lib/simd/Intel512single.h
Normal file
@ -0,0 +1,135 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Avx512Asm.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
// No guard can be multiply included as undef clearge of macros
|
||||
#undef VZERO
|
||||
#undef VMOV
|
||||
#undef VLOAD
|
||||
#undef VSTORE
|
||||
#define VZERO(A) VZEROf(A)
|
||||
#define VMOV(A,B) VMOVf(A,B)
|
||||
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
|
||||
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
|
||||
|
||||
#undef VADD
|
||||
#undef VSUB
|
||||
#undef VMUL
|
||||
#undef VMADD
|
||||
#define VADD(A,B,C) VADDf(A,B,C)
|
||||
#define VSUB(A,B,C) VSUBf(A,B,C)
|
||||
#define VMUL(Uri,Uir,Chi) VMULf(Uri,Uir,Chi)
|
||||
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
|
||||
|
||||
|
||||
#undef VTIMESI
|
||||
#undef VTIMESI0
|
||||
#undef VTIMESI1
|
||||
#undef VTIMESI2
|
||||
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
|
||||
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
|
||||
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
|
||||
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
|
||||
|
||||
#undef VTIMESMINUSI
|
||||
#undef VTIMESMINUSI0
|
||||
#undef VTIMESMINUSI1
|
||||
#undef VTIMESMINUSI2
|
||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
|
||||
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
|
||||
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
|
||||
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
|
||||
|
||||
#undef VACCTIMESI
|
||||
#undef VACCTIMESI0
|
||||
#undef VACCTIMESI1
|
||||
#undef VACCTIMESI2
|
||||
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
|
||||
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
|
||||
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
|
||||
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
|
||||
|
||||
#undef VACCTIMESMINUSI
|
||||
#undef VACCTIMESMINUSI0
|
||||
#undef VACCTIMESMINUSI1
|
||||
#undef VACCTIMESMINUSI2
|
||||
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
|
||||
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
|
||||
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
|
||||
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
|
||||
|
||||
#undef VACCTIMESI1MEM
|
||||
#undef VACCTIMESI2MEM
|
||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
|
||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
|
||||
|
||||
#undef VACCTIMESMINUSI1MEM
|
||||
#undef VACCTIMESMINUSI2MEM
|
||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
|
||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
|
||||
|
||||
#undef VPERM0
|
||||
#undef VPERM1
|
||||
#undef VPERM2
|
||||
#undef VPERM3
|
||||
#define VPERM0(A,B) VPERM0f(A,B)
|
||||
#define VPERM1(A,B) VPERM1f(A,B)
|
||||
#define VPERM2(A,B) VPERM2f(A,B)
|
||||
#define VPERM3(A,B) VPERM3f(A,B)
|
||||
|
||||
#undef VSHUFMEM
|
||||
#undef VADDMEM
|
||||
#undef VSUBMEM
|
||||
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
|
||||
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
|
||||
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
|
||||
|
||||
#undef VMOVIDUP
|
||||
#undef VMOVRDUP
|
||||
#undef VMADDSUB
|
||||
#undef VSHUF
|
||||
#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
|
||||
#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
|
||||
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
|
||||
#define VSHUF(A,B) VSHUFf(A,B)
|
||||
|
||||
|
||||
#undef ZEND1
|
||||
#undef ZEND2
|
||||
#undef ZLOAD
|
||||
#undef ZMUL
|
||||
#undef ZMADD
|
||||
#undef ZMULMEM2SP
|
||||
#undef ZMADDMEM2SP
|
||||
|
||||
#define ZEND1(A,B,C) ZEND1f(A,B,C)
|
||||
#define ZEND2(A,B,C) ZEND2f(A,B,C)
|
||||
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
|
||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
|
@ -25,13 +25,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_ASM_AV512_H
|
||||
#define GRID_ASM_AV512_H
|
||||
|
||||
// Serialisation elimination:
|
||||
// i) ZEND -> ZEND1, ZEND2, 6 fold round robin.
|
||||
// ii) TimesI -> TimesI_1, TimesI_2, 6 fold round robin
|
||||
//
|
||||
#ifndef GRID_ASM_INTEL_512_QCD_H
|
||||
#define GRID_ASM_INTEL_512_QCD_H
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Register allocations for Wilson Kernel are precision and IMCI/AVX512 indept
|
||||
@ -69,7 +64,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define UChi_12 %zmm23
|
||||
|
||||
#define Uir %zmm24
|
||||
//#define ONE %zmm24
|
||||
#define Uri %zmm25
|
||||
#define T1 %zmm24
|
||||
#define T2 %zmm25
|
||||
|
||||
#define Z0 %zmm26
|
||||
#define Z1 %zmm27
|
||||
@ -93,319 +91,29 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define Chimu_31 UChi_11
|
||||
#define Chimu_32 UChi_12
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// CONFIG IMCI/AVX512
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define ASM_IMCI
|
||||
#undef ASM_AVX512
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Opcodes common to AVX512 and IMCI
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#define MASK_REGS \
|
||||
__asm__ ("mov $0xAAAA, %%eax \n"\
|
||||
"kmov %%eax, %%k6 \n"\
|
||||
"knot %%k6, %%k7 \n" : : : "%eax");
|
||||
|
||||
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
|
||||
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"
|
||||
|
||||
#define VTIMESIf(A,DEST, Z) \
|
||||
VTIMESI0f(A,DEST, Z) \
|
||||
VTIMESI1f(A,DEST, Z) \
|
||||
VTIMESI2f(A,DEST, Z)
|
||||
|
||||
#define VTIMESId(A,DEST, Z) \
|
||||
VTIMESI0d(A,DEST, Z) \
|
||||
VTIMESI1d(A,DEST, Z) \
|
||||
VTIMESI2d(A,DEST, Z)
|
||||
|
||||
#define VTIMESMINUSIf(A,DEST, Z) \
|
||||
VTIMESMINUSI0f(A,DEST, Z) \
|
||||
VTIMESMINUSI1f(A,DEST, Z) \
|
||||
VTIMESMINUSI2f(A,DEST, Z)
|
||||
|
||||
#define VTIMESMINUSId(A,DEST, Z) \
|
||||
VTIMESMINUSI0d(A,DEST, Z) \
|
||||
VTIMESMINUSI1d(A,DEST, Z) \
|
||||
VTIMESMINUSI2d(A,DEST, Z)
|
||||
|
||||
#define VACCTIMESIf(A,ACC,tmp) \
|
||||
VACCTIMESI0f(A,ACC,tmp) \
|
||||
VACCTIMESI1f(A,ACC,tmp) \
|
||||
VACCTIMESI2f(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESI1MEMf(A,ACC,O,P) "vaddps " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2MEMf(A,ACC,O,P) "vsubrps " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"
|
||||
#define VACCTIMESMINUSI1MEMf(A,ACC,O,P) "vsubrps " #O"*64("#P"),"#A "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2MEMf(A,ACC,O,P) "vaddps " #O"*64("#P"),"#A "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESId(A,ACC,tmp) \
|
||||
VACCTIMESI0d(A,ACC,tmp) \
|
||||
VACCTIMESI1d(A,ACC,tmp) \
|
||||
VACCTIMESI2d(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESMINUSIf(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI0f(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI1f(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI2f(A,ACC,tmp)
|
||||
|
||||
#define VACCTIMESMINUSId(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI0d(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI1d(A,ACC,tmp) \
|
||||
VACCTIMESMINUSI2d(A,ACC,tmp)
|
||||
|
||||
#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A );
|
||||
#define LOAD64(A,ptr) LOAD64i(A,ptr)
|
||||
|
||||
#define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
|
||||
#define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"
|
||||
|
||||
// Field prefetch
|
||||
#define VPREFETCHNTA(O,A) "vprefetchnta "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n"
|
||||
#define VPREFETCH(O,A) "vprefetch0 "#O"*64("#A");\n" "vprefetch1 ("#O"+12)*64("#A");\n"
|
||||
#define VPREFETCHG(O,A)
|
||||
#define VPREFETCHW(O,A)
|
||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
|
||||
#define VEVICT(O,A)
|
||||
// "clevict0 "#O"*64("#A");\n"
|
||||
|
||||
#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n"
|
||||
|
||||
#define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n"
|
||||
#define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VSUBf(A,B,DEST) "vsubps " #A "," #B "," #DEST ";\n"
|
||||
#define VSUBd(A,B,DEST) "vsubpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VADDMEMf(O,A,B,DEST) "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VADDMEMd(O,A,B,DEST) "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VSUBMEMf(O,A,B,DEST) "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VSUBMEMd(O,A,B,DEST) "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VMULf(A,B,DEST) "vmulps " #A "," #B "," #DEST ";\n"
|
||||
#define VMULd(A,B,DEST) "vmulpd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VMADDf(A,B,DEST) "vfmadd231ps " #A "," #B "," #DEST ";\n"
|
||||
#define VMADDd(A,B,DEST) "vfmadd231pd " #A "," #B "," #DEST ";\n"
|
||||
|
||||
#define VMULMEMf(O,A,B,DEST) "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
|
||||
#define VMULMEMd(O,A,B,DEST) "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"
|
||||
|
||||
#define VMADDMEMf(O,A,B,DEST) "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
|
||||
#define VMADDMEMd(O,A,B,DEST) "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"
|
||||
|
||||
#define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
|
||||
#define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)
|
||||
|
||||
#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir) VMULf(Air,B,Ciirr)
|
||||
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir) VMULd(Air,B,Ciirr)
|
||||
|
||||
#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
|
||||
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)
|
||||
|
||||
#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
|
||||
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)
|
||||
|
||||
// Need VSHUFMULMEMf,d for KNC
|
||||
// AVX512 friendly
|
||||
#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMULMEMf(O,P,B,Biirr) \
|
||||
VMULMEMf(O,P,C,Ciirr) \
|
||||
VMULf(tmp,B,Briir) \
|
||||
VMULf(tmp,C,Criir)
|
||||
|
||||
#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMULMEMd(O,P,B,Biirr) \
|
||||
VMULMEMd(O,P,C,Ciirr) \
|
||||
VMULd(tmp,B,Briir) \
|
||||
VMULd(tmp,C,Criir)
|
||||
|
||||
#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
|
||||
VSHUFMEMf(O,P,tmp) \
|
||||
VMADDMEMf(O,P,B,Biirr) \
|
||||
VMADDMEMf(O,P,C,Ciirr) \
|
||||
VMADDf(tmp,B,Briir) \
|
||||
VMADDf(tmp,C,Criir)
|
||||
|
||||
#define TRAP " int3 ;\n"
|
||||
|
||||
#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
|
||||
VSHUFMEMd(O,P,tmp) \
|
||||
VMADDMEMd(O,P,B,Biirr) \
|
||||
VMADDMEMd(O,P,C,Ciirr) \
|
||||
VMADDd(tmp,B,Briir) \
|
||||
VMADDd(tmp,C,Criir)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Lane swizzling changed between AVX512 and IMCI and requires arch dependent complex support
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// AVX512 special (Knights Landing)
|
||||
#ifdef ASM_AVX512
|
||||
#define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
#define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
// Swaps Re/Im
|
||||
#define VSHUFd(A,DEST) "vshufpd $0x5, " #A "," #A "," #DEST ";\n"
|
||||
#define VSHUFf(A,DEST) "vshufps $0x55," #A "," #A "," #DEST ";\n"
|
||||
// Memops are useful for optimisation
|
||||
#define VSHUFMEMd(OFF,A,DEST) "vpshufpd $0x4e, " #OFF"("#A ")," #DEST ";\n"
|
||||
#define VSHUFMEMf(OFF,A,DEST) "vpshufps $0xb1, " #OFF"("#A ")," #DEST ";\n"
|
||||
|
||||
|
||||
// Merges accumulation for complex dot chain
|
||||
// TODO: 12 operation saving:
|
||||
// # could SWIZ op 18{cdab} and eliminate temporary // 12cycles
|
||||
// # no use KNL though. Fingour something else there.
|
||||
// # All swizzles become perms ops, but gain addsub; subadd must use this
|
||||
// # uint32_t (0x7F << 23 )
|
||||
// # uint64_t (0x3FF<< 52 ) ; vpbroadcast
|
||||
#define ZEND1f(Criir,Ciirr, tmp) \
|
||||
"vshufps $0xb1," #Ciirr "," #Criir "," #tmp ";\n"\
|
||||
"vaddps " #Criir "," #tmp "," #Criir"{%k6}" ";\n"
|
||||
|
||||
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "," #tmp "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define ZEND2d(Criir,Ciirr, tmp) \
|
||||
"vshufpd $0x33," #Ciirr "," #Criir "," #tmp ";\n"\
|
||||
"vaddpd " #Criir "," #tmp "," #Criir"{%k6}" ";\n"
|
||||
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "," #tmp "," #Criir"{%k7}" ";\n"
|
||||
|
||||
// Further opt possible: KNC -- use swizzle operand ; no addsub.
|
||||
// KNL -- addsub. Saves 6 ops, 12 cycles; KNL cost of loading "1" as only fmaddsub
|
||||
// no swizzle on KNL.
|
||||
#define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
|
||||
#define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESI0d(A,DEST, Z) VSHUFd(A,DEST)
|
||||
#define VTIMESI1d(A,DEST, Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESI2d(A,DEST, Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0f(A,DEST,Z) VSHUFf(A,DEST)
|
||||
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0d(A,DEST,Z) VSHUFd(A,DEST)
|
||||
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
|
||||
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
|
||||
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VPERM0f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
|
||||
#define VPERM1f(A,B) "vshuff32x4 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
|
||||
#define VPERM2f(A,B) "vshufps " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
|
||||
#define VPERM3f(A,B) "vshufps " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
|
||||
|
||||
#define VPERM0d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(1,0,3,2) ";\n"
|
||||
#define VPERM1d(A,B) "vshuff64x2 " #A "," #B "," "#B" ", " #_MM_SELECT_FOUR_FOUR(2,3,0,1) ";\n"
|
||||
#define VPERM2d(A,B) "vshufpd " #A "," #B "," "#B" ", " 0x55 ";\n"
|
||||
#define VPERM3d(A,B) VMOVd(A,B)
|
||||
|
||||
#include <simd/Intel512common.h>
|
||||
#ifdef AVX512
|
||||
#include <simd/Intel512avx.h>
|
||||
//#include <simd/Intel512avxAddsub.h> // Alternate implementation
|
||||
#endif
|
||||
#ifdef IMCI
|
||||
#include <simd/Intel512imci.h>
|
||||
#endif
|
||||
|
||||
// Knights Corner specials
|
||||
#ifdef ASM_IMCI
|
||||
#define VSTOREf(OFF,PTR,SRC) "vmovnrngoaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
#define VSTOREd(OFF,PTR,SRC) "vmovnrngoapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
//#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
//#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Macros used to build wilson kernel -- can rationalise and simplify
|
||||
// a little as some duplication developed during trying different
|
||||
// variants during optimisation. Could cut back to only those used.
|
||||
//////////////////////////////////////////////////////////////////
|
||||
|
||||
#define VSHUFf(A,DEST) "vmovaps " #A "{cdab} , " #DEST ";\n"
|
||||
#define VSHUFd(A,DEST) "vmovapd " #A "{cdab} , " #DEST ";\n"
|
||||
|
||||
// Memops are useful for optimisation
|
||||
#define VSHUFMEMd(OFF,A,DEST) "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n"
|
||||
#define VSHUFMEMf(OFF,A,DEST) "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n"
|
||||
|
||||
#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
|
||||
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
|
||||
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"
|
||||
|
||||
// Further opt possible: KNC -- use swizzle operand ; no addsub.
|
||||
// KNL -- addsub. Saves 6 ops, 12 cycles; KNL cost of loading "1" as only fmaddsub
|
||||
// no swizzle on KNL.
|
||||
#define VTIMESI0f(A,DEST, Z)
|
||||
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESI0d(A,DEST, Z)
|
||||
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0f(A,DEST,Z)
|
||||
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VTIMESMINUSI0d(A,DEST,Z)
|
||||
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
|
||||
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESI0f(A,ACC,tmp)
|
||||
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESI0d(A,ACC,tmp)
|
||||
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0f(A,ACC,tmp)
|
||||
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
#define VACCTIMESMINUSI0d(A,ACC,tmp)
|
||||
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
|
||||
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"
|
||||
|
||||
//#define ZENDf(Criir,Ciirr, tmp)
|
||||
|
||||
//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
|
||||
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1
|
||||
|
||||
#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
|
||||
#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n"
|
||||
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
|
||||
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"
|
||||
|
||||
#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
|
||||
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
|
||||
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
|
||||
#define VPERM3d(A,B) VMOVd(A,B)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// const SiteSpinor * ptr = & in._odata[offset];
|
||||
#define LOAD_CHIMU(PTR) LOAD_CHIMUi(PTR)
|
||||
#define LOAD_CHI(PTR) LOAD_CHIi(PTR)
|
||||
#define LOAD_CHI(PTR) LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
|
||||
#define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
|
||||
#define SAVE_CHI(PTR) SAVE_CHIi(PTR)
|
||||
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
|
||||
|
||||
#define LOAD_CHIMUi(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ (\
|
||||
#define LOAD_CHIMUi \
|
||||
LOAD_CHIMU01i \
|
||||
LOAD_CHIMU23i );
|
||||
|
||||
@ -437,16 +145,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
// const SiteHalfSpinor *ptr = &buf[offset];
|
||||
|
||||
#define LOAD_CHIi(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
#define LOAD_CHIi \
|
||||
VLOAD(0,%r8,Chi_00) \
|
||||
VLOAD(1,%r8,Chi_01) \
|
||||
VLOAD(2,%r8,Chi_02) \
|
||||
VLOAD(3,%r8,Chi_10) \
|
||||
VLOAD(4,%r8,Chi_11) \
|
||||
VLOAD(5,%r8,Chi_12) \
|
||||
);
|
||||
VLOAD(5,%r8,Chi_12)
|
||||
|
||||
|
||||
#define SAVE_UCHIi(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
@ -495,7 +201,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
|
||||
// KNL is DUAL issue for FP, and lifting these loads is potentially important.
|
||||
// Need detailed profile data to be sure.
|
||||
|
||||
#if 0
|
||||
#define PREFETCH_U(A) \
|
||||
LOAD64(%r8,&U._odata[sU](A)) \
|
||||
__asm__ ( \
|
||||
@ -524,7 +230,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VPREFETCHW(9,%r8) \
|
||||
VPREFETCHW(10,%r8) \
|
||||
VPREFETCHW(11,%r8) );
|
||||
|
||||
#endif
|
||||
|
||||
#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
|
||||
|
||||
@ -538,6 +244,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
|
||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
|
||||
|
||||
#if 0
|
||||
#define MULT_2SPIN_UNOPT(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
@ -583,19 +290,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
ZEND2(UChi_11,Z3,Chi_10) \
|
||||
ZEND2(UChi_02,Z4,Chi_02) \
|
||||
ZEND2(UChi_12,Z5,Chi_12) );
|
||||
#endif
|
||||
|
||||
#define MULT_2SPIN(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
|
||||
|
||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
|
||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
|
||||
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
|
||||
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
|
||||
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
|
||||
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
|
||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCH)
|
||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN_PF(ptr,pf,VPREFETCHNTA)
|
||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
|
||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
|
||||
|
||||
// MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
|
||||
|
||||
#if 0
|
||||
#define MULT_2SPIN_PF(ptr,pf,VPF) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
@ -636,8 +344,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
ZEND2(UChi_02,Z4,Chi_02) \
|
||||
VPF(11,%r9) \
|
||||
ZEND2(UChi_12,Z5,Chi_12) );
|
||||
#endif
|
||||
|
||||
|
||||
#if 0
|
||||
#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \
|
||||
LOAD64(%r8,ptr) \
|
||||
LOAD64(%r9,pf) \
|
||||
@ -657,7 +366,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VPF(9,%r9) \
|
||||
VPF(10,%r9) \
|
||||
VPF(11,%r9) );
|
||||
|
||||
#endif
|
||||
|
||||
// Pretty much Perfectly Pipelined
|
||||
|
||||
@ -667,56 +376,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||
//define VTIMESIf(A,DEST, Z)
|
||||
// These don't work if DEST==Z. FIXME.
|
||||
#define XP_PROJ __asm__ ( \
|
||||
VACCTIMESI(Chimu_30,Chi_00,Z0) \
|
||||
VACCTIMESI(Chimu_31,Chi_01,Z1) \
|
||||
VACCTIMESI(Chimu_32,Chi_02,Z2) \
|
||||
VACCTIMESI(Chimu_20,Chi_10,Z3) \
|
||||
VACCTIMESI(Chimu_21,Chi_11,Z4) \
|
||||
VACCTIMESI(Chimu_22,Chi_12,Z5) );
|
||||
|
||||
#define XP_PROJMEM(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIi \
|
||||
SHUF_CHIMU23i \
|
||||
VACCTIMESI1MEM(Chimu_30,Chi_00,0,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_31,Chi_01,1,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_32,Chi_02,2,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_20,Chi_10,3,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_21,Chi_11,4,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_22,Chi_12,5,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_30,Chi_00,0,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_31,Chi_01,1,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_32,Chi_02,2,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_20,Chi_10,3,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_21,Chi_11,4,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_22,Chi_12,5,%r8) );
|
||||
|
||||
|
||||
#define YP_PROJ __asm__ ( \
|
||||
VSUB(Chimu_30,Chimu_00,Chi_00)\
|
||||
VSUB(Chimu_31,Chimu_01,Chi_01)\
|
||||
VSUB(Chimu_32,Chimu_02,Chi_02)\
|
||||
VADD(Chimu_10,Chimu_20,Chi_10)\
|
||||
VADD(Chimu_11,Chimu_21,Chi_11)\
|
||||
VADD(Chimu_12,Chimu_22,Chi_12) );
|
||||
|
||||
#define EVICT_SPINOR(reg) \
|
||||
VEVICT(0,reg) \
|
||||
VEVICT(1,reg) \
|
||||
VEVICT(2,reg) \
|
||||
VEVICT(3,reg) \
|
||||
VEVICT(4,reg) \
|
||||
VEVICT(5,reg) \
|
||||
VEVICT(6,reg) \
|
||||
VEVICT(7,reg) \
|
||||
VEVICT(8,reg) \
|
||||
VEVICT(9,reg) \
|
||||
VEVICT(9,reg) \
|
||||
VEVICT(10,reg) \
|
||||
VEVICT(11,reg)
|
||||
VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
|
||||
VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
|
||||
VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
|
||||
VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
|
||||
VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
|
||||
VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
|
||||
VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
|
||||
VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
|
||||
VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
|
||||
VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
|
||||
VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
|
||||
VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );
|
||||
|
||||
|
||||
#define YP_PROJMEM(ptr) \
|
||||
@ -729,43 +405,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VADDMEM(6,%r8,Chimu_10,Chi_10) \
|
||||
VADDMEM(7,%r8,Chimu_11,Chi_11) \
|
||||
VADDMEM(8,%r8,Chimu_12,Chi_12) );
|
||||
// EVICT_SPINOR(%r8) );
|
||||
|
||||
#define ZP_PROJ __asm__ ( \
|
||||
VACCTIMESI(Chimu_20,Chi_00,Z0) \
|
||||
VACCTIMESI(Chimu_21,Chi_01,Z1) \
|
||||
VACCTIMESI(Chimu_22,Chi_02,Z2) \
|
||||
VACCTIMESMINUSI(Chimu_30,Chi_10,Z3) \
|
||||
VACCTIMESMINUSI(Chimu_31,Chi_11,Z4) \
|
||||
VACCTIMESMINUSI(Chimu_32,Chi_12,Z5) );
|
||||
|
||||
#define ZP_PROJMEM(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
LOAD_CHIi \
|
||||
SHUF_CHIMU23i \
|
||||
VACCTIMESI1MEM(Chimu_20,Chi_00,0,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_21,Chi_01,1,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_22,Chi_02,2,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_30,Chi_10,3,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_31,Chi_11,4,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_32,Chi_12,5,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_20,Chi_00,0,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_21,Chi_01,1,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_22,Chi_02,2,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_30,Chi_10,3,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_31,Chi_11,4,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_32,Chi_12,5,%r8) \
|
||||
EVICT_SPINOR(%r8) );
|
||||
|
||||
|
||||
|
||||
#define TP_PROJ __asm__ ( \
|
||||
VADD(Chimu_00,Chimu_20,Chi_00) \
|
||||
VADD(Chimu_01,Chimu_21,Chi_01) \
|
||||
VADD(Chimu_02,Chimu_22,Chi_02) \
|
||||
VADD(Chimu_10,Chimu_30,Chi_10) \
|
||||
VADD(Chimu_11,Chimu_31,Chi_11) \
|
||||
VADD(Chimu_12,Chimu_32,Chi_12) );
|
||||
VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
|
||||
VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
|
||||
VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
|
||||
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
|
||||
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
|
||||
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
|
||||
VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
|
||||
VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
|
||||
VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
|
||||
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
|
||||
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
|
||||
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
|
||||
|
||||
|
||||
#define TP_PROJMEM(ptr) \
|
||||
@ -777,44 +434,28 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VADDMEM(8,%r8,Chimu_02,Chi_02) \
|
||||
VADDMEM(9,%r8,Chimu_10,Chi_10) \
|
||||
VADDMEM(10,%r8,Chimu_11,Chi_11) \
|
||||
VADDMEM(11,%r8,Chimu_12,Chi_12) \
|
||||
EVICT_SPINOR(%r8) );
|
||||
|
||||
VADDMEM(11,%r8,Chimu_12,Chi_12) );
|
||||
|
||||
// hspin(0)=fspin(0)-timesI(fspin(3))
|
||||
// hspin(1)=fspin(1)-timesI(fspin(2))
|
||||
#define XM_PROJ __asm__ ( \
|
||||
VACCTIMESMINUSI(Chimu_30,Chi_00,Z0) \
|
||||
VACCTIMESMINUSI(Chimu_31,Chi_01,Z1) \
|
||||
VACCTIMESMINUSI(Chimu_32,Chi_02,Z2) \
|
||||
VACCTIMESMINUSI(Chimu_20,Chi_10,Z3) \
|
||||
VACCTIMESMINUSI(Chimu_21,Chi_11,Z4) \
|
||||
VACCTIMESMINUSI(Chimu_22,Chi_12,Z5) );
|
||||
|
||||
#define XM_PROJMEM(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
LOAD64(%r8,PTR)\
|
||||
__asm__ ( \
|
||||
SHUF_CHIMU23i \
|
||||
VACCTIMESMINUSI1MEM(Chimu_30,Chi_00,0,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_31,Chi_01,1,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_32,Chi_02,2,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_20,Chi_10,3,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_21,Chi_11,4,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_22,Chi_12,5,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_30,Chi_00,0,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_31,Chi_01,1,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_32,Chi_02,2,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_20,Chi_10,3,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_21,Chi_11,4,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_22,Chi_12,5,%r8) );
|
||||
|
||||
#define YM_PROJ __asm__ ( \
|
||||
VADD(Chimu_00,Chimu_30,Chi_00)\
|
||||
VADD(Chimu_01,Chimu_31,Chi_01)\
|
||||
VADD(Chimu_02,Chimu_32,Chi_02)\
|
||||
VSUB(Chimu_20,Chimu_10,Chi_10)\
|
||||
VSUB(Chimu_21,Chimu_11,Chi_11)\
|
||||
VSUB(Chimu_22,Chimu_12,Chi_12) );
|
||||
LOAD_CHIi \
|
||||
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
|
||||
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
|
||||
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
|
||||
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
|
||||
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
|
||||
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
|
||||
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
|
||||
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
|
||||
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
|
||||
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
|
||||
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
|
||||
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
|
||||
|
||||
#define YM_PROJMEM(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
@ -825,45 +466,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VADDMEM(11,%r8,Chimu_02,Chi_02) \
|
||||
VSUBMEM(6,%r8,Chimu_10,Chi_10) \
|
||||
VSUBMEM(7,%r8,Chimu_11,Chi_11) \
|
||||
VSUBMEM(8,%r8,Chimu_12,Chi_12) \
|
||||
EVICT_SPINOR(%r8) );
|
||||
|
||||
|
||||
#define ZM_PROJ __asm__ ( \
|
||||
VACCTIMESMINUSI(Chimu_20,Chi_00,Z0)\
|
||||
VACCTIMESMINUSI(Chimu_21,Chi_01,Z1)\
|
||||
VACCTIMESMINUSI(Chimu_22,Chi_02,Z2)\
|
||||
VACCTIMESI(Chimu_30,Chi_10,Z3)\
|
||||
VACCTIMESI(Chimu_31,Chi_11,Z4)\
|
||||
VACCTIMESI(Chimu_32,Chi_12,Z5));
|
||||
VSUBMEM(8,%r8,Chimu_12,Chi_12) );
|
||||
|
||||
#define ZM_PROJMEM(PTR) \
|
||||
LOAD64(%r8,PTR) \
|
||||
__asm__ ( \
|
||||
SHUF_CHIMU23i \
|
||||
VACCTIMESMINUSI1MEM(Chimu_20,Chi_00,0,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_21,Chi_01,1,%r8) \
|
||||
VACCTIMESMINUSI1MEM(Chimu_22,Chi_02,2,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_30,Chi_10,3,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_31,Chi_11,4,%r8) \
|
||||
VACCTIMESI1MEM(Chimu_32,Chi_12,5,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_20,Chi_00,0,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_21,Chi_01,1,%r8) \
|
||||
VACCTIMESMINUSI2MEM(Chimu_22,Chi_02,2,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_30,Chi_10,3,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_31,Chi_11,4,%r8) \
|
||||
VACCTIMESI2MEM(Chimu_32,Chi_12,5,%r8) \
|
||||
EVICT_SPINOR(%r8) );
|
||||
|
||||
|
||||
#define TM_PROJ __asm__ ( \
|
||||
VSUB(Chimu_20,Chimu_00,Chi_00)\
|
||||
VSUB(Chimu_21,Chimu_01,Chi_01)\
|
||||
VSUB(Chimu_22,Chimu_02,Chi_02)\
|
||||
VSUB(Chimu_30,Chimu_10,Chi_10)\
|
||||
VSUB(Chimu_31,Chimu_11,Chi_11)\
|
||||
VSUB(Chimu_32,Chimu_12,Chi_12) );
|
||||
|
||||
LOAD_CHIi \
|
||||
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
|
||||
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
|
||||
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
|
||||
VACCTIMESI1(Chi_10,Chi_10,Chimu_30)\
|
||||
VACCTIMESI1(Chi_11,Chi_11,Chimu_31)\
|
||||
VACCTIMESI1(Chi_12,Chi_12,Chimu_32)\
|
||||
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20)\
|
||||
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21)\
|
||||
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22)\
|
||||
VACCTIMESI2(Chi_10,Chi_10,Chimu_30)\
|
||||
VACCTIMESI2(Chi_11,Chi_11,Chimu_31)\
|
||||
VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );
|
||||
|
||||
#define TM_PROJMEM(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
@ -874,8 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VSUBMEM(8,%r8,Chimu_02,Chi_02) \
|
||||
VSUBMEM(9,%r8,Chimu_10,Chi_10) \
|
||||
VSUBMEM(10,%r8,Chimu_11,Chi_11) \
|
||||
VSUBMEM(11,%r8,Chimu_12,Chi_12) \
|
||||
EVICT_SPINOR(%r8) );
|
||||
VSUBMEM(11,%r8,Chimu_12,Chi_12) );
|
||||
|
||||
// fspin(0)=hspin(0)
|
||||
// fspin(1)=hspin(1)
|
||||
@ -1102,7 +722,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VSUB(UChi_11,result_31,result_31)\
|
||||
VSUB(UChi_12,result_32,result_32) );
|
||||
|
||||
#define PREFETCH_CHIMU(A)
|
||||
//define PREFETCH_CHIMU(A)
|
||||
|
||||
#define PERMUTE_DIR0 __asm__ ( \
|
||||
VPERM0(Chi_00,Chi_00) \
|
||||
@ -1136,4 +756,65 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
VPERM3(Chi_11,Chi_11) \
|
||||
VPERM3(Chi_12,Chi_12) );
|
||||
|
||||
#define MULT_ADDSUB_2SPIN1(ptr) \
|
||||
LOAD64(%r8,ptr)
|
||||
/*
|
||||
* __asm__ ( \
|
||||
);
|
||||
VMUL(Z0,%zmm2,%zmm3) \
|
||||
*/
|
||||
#define MULT_ADDSUB_2SPIN(ptr) \
|
||||
LOAD64(%r8,ptr) \
|
||||
__asm__ ( \
|
||||
VMOVIDUP(0,%r8,Z0 ) \
|
||||
VMOVIDUP(3,%r8,Z1 )\
|
||||
VMOVIDUP(6,%r8,Z2 )\
|
||||
VSHUF(Chi_00,T1) \
|
||||
VSHUF(Chi_10,T2) \
|
||||
\
|
||||
VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
|
||||
VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
|
||||
VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
|
||||
VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
|
||||
VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
|
||||
VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
|
||||
\
|
||||
VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
|
||||
VMADDSUB(Z3,Chi_10,UChi_10) VSHUF(Chi_11,T2) \
|
||||
VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
|
||||
VMADDSUB(Z4,Chi_10,UChi_11)\
|
||||
VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
|
||||
VMADDSUB(Z5,Chi_10,UChi_12)\
|
||||
\
|
||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
|
||||
VMADDSUB(Z0,T2,UChi_10)\
|
||||
VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
|
||||
VMADDSUB(Z1,T2,UChi_11)\
|
||||
VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
|
||||
VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
|
||||
\
|
||||
VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
|
||||
VMADDSUB(Z3,Chi_11,UChi_10) VSHUF(Chi_12,T2) \
|
||||
VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
|
||||
VMADDSUB(Z4,Chi_11,UChi_11)\
|
||||
VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
|
||||
VMADDSUB(Z5,Chi_11,UChi_12)\
|
||||
\
|
||||
VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
|
||||
VMADDSUB(Z0,T2,UChi_10)\
|
||||
VMADDSUB(Z1,T1,UChi_01)\
|
||||
VMADDSUB(Z1,T2,UChi_11)\
|
||||
VMADDSUB(Z2,T1,UChi_02)\
|
||||
VMADDSUB(Z2,T2,UChi_12)\
|
||||
\
|
||||
VMADDSUB(Z3,Chi_02,UChi_00)\
|
||||
VMADDSUB(Z3,Chi_12,UChi_10)\
|
||||
VMADDSUB(Z4,Chi_02,UChi_01)\
|
||||
VMADDSUB(Z4,Chi_12,UChi_11)\
|
||||
VMADDSUB(Z5,Chi_02,UChi_02)\
|
||||
VMADDSUB(Z5,Chi_12,UChi_12)\
|
||||
);
|
||||
|
||||
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
|
||||
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
|
||||
bin_PROGRAMS = Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd
|
||||
bin_PROGRAMS += Test_GaugeAction Test_RectPlaq Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_gp_rect_force Test_gparity Test_gpdwf_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd
|
||||
|
||||
|
||||
Test_GaugeAction_SOURCES=Test_GaugeAction.cc
|
||||
|
@ -20,6 +20,8 @@ endif
|
||||
|
||||
if BUILD_ZMM
|
||||
bin_PROGRAMS=Test_zmm
|
||||
else
|
||||
bin_PROGRAMS=
|
||||
endif
|
||||
|
||||
include Make.inc
|
||||
|
@ -42,6 +42,8 @@ public:
|
||||
int, domaindecompose,
|
||||
int, domainsize,
|
||||
int, order,
|
||||
int, Ls,
|
||||
double, mq,
|
||||
double, lo,
|
||||
double, hi,
|
||||
int, steps);
|
||||
@ -327,7 +329,7 @@ public:
|
||||
CoarseVector Ctmp(_CoarseOperator.Grid());
|
||||
CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
|
||||
|
||||
ConjugateGradient<CoarseVector> CG(1.0e-2,100000);
|
||||
ConjugateGradient<CoarseVector> CG(3.0e-3,100000);
|
||||
// ConjugateGradient<FineField> fCG(3.0e-2,1000);
|
||||
|
||||
HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
|
||||
@ -474,7 +476,7 @@ int main (int argc, char ** argv)
|
||||
read(RD,"params",params);
|
||||
std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;
|
||||
|
||||
const int Ls=16;
|
||||
const int Ls=params.Ls;
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
@ -536,7 +538,7 @@ int main (int argc, char ** argv)
|
||||
// SU3::HotConfiguration(RNG4,Umu);
|
||||
// Umu=zero;
|
||||
|
||||
RealD mass=0.001;
|
||||
RealD mass=params.mq;
|
||||
RealD M5=1.8;
|
||||
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
@ -558,7 +560,8 @@ int main (int argc, char ** argv)
|
||||
assert ( (nbasis & 0x1)==0);
|
||||
int nb=nbasis/2;
|
||||
std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
|
||||
Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
|
||||
// Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
|
||||
Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
|
||||
for(int n=0;n<nb;n++){
|
||||
G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
|
||||
std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
|
||||
@ -594,7 +597,7 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
|
||||
ConjugateGradient<CoarseVector> CG(1.0e-6,100000);
|
||||
CG(PosdefLdop,c_src,c_res);
|
||||
// CG(PosdefLdop,c_src,c_res);
|
||||
|
||||
// std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
// std::cout<<GridLogMessage << "Solving indef-MCR on coarse space "<< std::endl;
|
||||
@ -619,17 +622,17 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing smoother efficacy"<< std::endl;
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
Precon.SmootherTest(src);
|
||||
// Precon.SmootherTest(src);
|
||||
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
PreconDD.SmootherTest(src);
|
||||
// PreconDD.SmootherTest(src);
|
||||
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
PreconDD.SAP(src,result);
|
||||
// PreconDD.SAP(src,result);
|
||||
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
|
||||
@ -657,18 +660,18 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
|
||||
result=zero;
|
||||
std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
|
||||
PGCRDD(HermIndefOp,src,result);
|
||||
// PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
|
||||
// result=zero;
|
||||
// std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
|
||||
// PGCRDD(HermIndefOp,src,result);
|
||||
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
// PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
|
||||
// std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
|
||||
// result=zero;
|
||||
// PGCR(HermIndefOp,src,result);
|
||||
PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,8);
|
||||
std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
|
||||
result=zero;
|
||||
PGCR(HermIndefOp,src,result);
|
||||
|
||||
std::cout<<GridLogMessage << "**************************************************"<< std::endl;
|
||||
std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
|
||||
|
@ -145,7 +145,7 @@ void Tester(const functor &func)
|
||||
|
||||
int ok=0;
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
if ( abs(reference[i]-result[i])>0){
|
||||
if ( abs(reference[i]-result[i])>1.0e-7){
|
||||
std::cout<<GridLogMessage<< "*****" << std::endl;
|
||||
std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
|
||||
ok++;
|
||||
|
@ -27,15 +27,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#include <Grid.h>
|
||||
#include <PerfCount.h>
|
||||
#include <simd/Avx512Asm.h>
|
||||
#include <simd/Intel512wilson.h>
|
||||
|
||||
|
||||
using namespace Grid;
|
||||
using namespace Grid::QCD;
|
||||
|
||||
void ZmulF(void *ptr1,void *ptr2,void *ptr3);
|
||||
void Zmul(void *ptr1,void *ptr2,void *ptr3);
|
||||
void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3);
|
||||
void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);
|
||||
void TimesIAvx512F(void *ptr1,void *ptr3);
|
||||
void TimesIAvx512(void *ptr1,void *ptr3);
|
||||
void TimesMinusIAvx512F(void *ptr1,void *ptr3);
|
||||
void TimesMinusIAvx512(void *ptr1,void *ptr3);
|
||||
|
||||
|
||||
|
||||
@ -63,50 +68,106 @@ int main(int argc,char **argv)
|
||||
|
||||
vColourMatrixD mat;
|
||||
vHalfSpinColourVectorD vec;
|
||||
vHalfSpinColourVectorD vec1;
|
||||
vHalfSpinColourVectorD vec2;
|
||||
vHalfSpinColourVectorD vec3;
|
||||
|
||||
vHalfSpinColourVectorD matvec;
|
||||
vHalfSpinColourVectorD ref;
|
||||
vComplexD err;
|
||||
|
||||
random(sRNG,vec1);
|
||||
vec1 = std::complex<double>(0.1,3.0);
|
||||
random(sRNG,vec2);
|
||||
vec2=2.0;
|
||||
random(sRNG,vec3);
|
||||
|
||||
//std::cout << "Zmul vec1"<<vec1<<" &vec1 "<<& vec1<<std::endl;
|
||||
//std::cout << "Zmul vec2"<<vec2<<" &vec2 "<<& vec2<<std::endl;
|
||||
//std::cout << "Zmul vec3"<<vec3<<" &vec3 "<<& vec3<<std::endl;
|
||||
for(int sp=0;sp<2;sp++){
|
||||
for(int co=0;co<3;co++){
|
||||
ref()(sp)(co) = vec1()(sp)(co)*vec2()(sp)(co);
|
||||
}}
|
||||
|
||||
Zmul((void *)&vec1,(void *)&vec2,(void *)&vec3);
|
||||
//std::cout << "Zmul vec3"<<vec3<<" &vec3 "<<& vec3<<std::endl;
|
||||
//std::cout << "Zmul \n\t ref "<<ref<<"\n\t vec3"<<vec3 <<std::endl;
|
||||
ref = ref - vec3;
|
||||
err = TensorRemove(innerProduct(ref,ref));
|
||||
std::cout <<"Zmul diff "<< Reduce(err)<<std::endl;
|
||||
|
||||
random(sRNG,mat);
|
||||
mat = zero;
|
||||
mat()()(0,0) = 1.0;
|
||||
random(sRNG,vec);
|
||||
|
||||
ref = mat*vec;
|
||||
|
||||
WilsonDslashAvx512((void *)&vec, (void *)&mat,(void *)&matvec);
|
||||
|
||||
//std::cout << ref <<std::endl;
|
||||
//std::cout << matvec<<std::endl;
|
||||
ref = ref - matvec;
|
||||
err = TensorRemove(innerProduct(ref,ref));
|
||||
std::cout <<"Double SU3 x 2spin diff "<< Reduce(err)<<std::endl;
|
||||
|
||||
vColourMatrixF matF;
|
||||
vHalfSpinColourVectorF vec1F;
|
||||
vHalfSpinColourVectorF vec2F;
|
||||
vHalfSpinColourVectorF vec3F;
|
||||
vHalfSpinColourVectorF vecF;
|
||||
vHalfSpinColourVectorF matvecF;
|
||||
vHalfSpinColourVectorF refF;
|
||||
vComplexF errF;
|
||||
|
||||
random(sRNG,matF);
|
||||
matF = zero;
|
||||
matF()()(0,0)=1.0;
|
||||
random(sRNG,vecF);
|
||||
|
||||
refF = matF*vecF;
|
||||
|
||||
WilsonDslashAvx512F((void *)&vecF, (void *)&matF,(void *)&matvecF);
|
||||
|
||||
//std::cout << refF <<std::endl;
|
||||
//std::cout << matvecF<<std::endl;
|
||||
|
||||
refF = refF-matvecF;
|
||||
errF = TensorRemove(innerProduct(refF,refF));
|
||||
std::cout <<"Single SU3 x 2spin diff "<< Reduce(errF)<<std::endl;
|
||||
|
||||
TimesIAvx512F((void *)&vecF,(void *)&matvecF);
|
||||
//std::cout << timesI(vecF)<<std::endl;
|
||||
//std::cout << matvecF<<std::endl;
|
||||
refF = timesI(vecF)-matvecF;
|
||||
errF = TensorRemove(innerProduct(refF,refF));
|
||||
std::cout <<" timesI single diff "<< Reduce(errF)<<std::endl;
|
||||
|
||||
TimesIAvx512((void *)&vec,(void *)&matvec);
|
||||
|
||||
//std::cout << timesI(vec)<<std::endl;
|
||||
//std::cout << matvec<<std::endl;
|
||||
|
||||
ref = timesI(vec)-matvec;
|
||||
err = TensorRemove(innerProduct(ref,ref));
|
||||
std::cout <<" timesI double diff "<< Reduce(err)<<std::endl;
|
||||
|
||||
TimesMinusIAvx512F((void *)&vecF,(void *)&matvecF);
|
||||
//std::cout << timesMinusI(vecF)<<std::endl;
|
||||
//std::cout << matvecF<<std::endl;
|
||||
refF = timesMinusI(vecF)-matvecF;
|
||||
errF = TensorRemove(innerProduct(refF,refF));
|
||||
std::cout <<" timesMinusI single diff "<< Reduce(errF)<<std::endl;
|
||||
|
||||
TimesMinusIAvx512((void *)&vec,(void *)&matvec);
|
||||
//std::cout << timesMinusI(vec)<<std::endl;
|
||||
//std::cout << matvec<<std::endl;
|
||||
|
||||
ref = timesMinusI(vec)-matvec;
|
||||
err = TensorRemove(innerProduct(ref,ref));
|
||||
std::cout <<" timesMinusI double diff "<< Reduce(err)<<std::endl;
|
||||
|
||||
|
||||
LatticeFermion src (FGrid);
|
||||
LatticeFermion tmp (FGrid);
|
||||
LatticeFermion srce(FrbGrid);
|
||||
|
||||
LatticeFermion resulto(FrbGrid); resulto=zero;
|
||||
@ -114,13 +175,14 @@ int main(int argc,char **argv)
|
||||
LatticeFermion diff(FrbGrid);
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
|
||||
#if 1
|
||||
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||
random(RNG5,src);
|
||||
#if 1
|
||||
random(RNG4,Umu);
|
||||
#else
|
||||
int mmu=3;
|
||||
int mmu=2;
|
||||
std::vector<LatticeColourMatrix> U(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
@ -157,7 +219,7 @@ int main(int argc,char **argv)
|
||||
}
|
||||
t1=usecond();
|
||||
|
||||
|
||||
#if 1
|
||||
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
|
||||
Dw.DhopOE(srce,resulta,0);
|
||||
PerformanceCounter Counter(i);
|
||||
@ -166,50 +228,119 @@ int main(int argc,char **argv)
|
||||
Counter.Stop();
|
||||
Counter.Report();
|
||||
}
|
||||
resulta = (-0.5) * resulta;
|
||||
#endif
|
||||
//resulta = (-0.5) * resulta;
|
||||
|
||||
std::cout<<GridLogMessage << "Called Asm Dw"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(resulta)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops*ncall/(t1-t0)<<std::endl;
|
||||
diff = resulto-resulta;
|
||||
std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
|
||||
|
||||
std::cout<<std::endl;
|
||||
#if 0
|
||||
std::cout<<"=========== result Grid ============="<<std::endl;
|
||||
std::cout<<std::endl;
|
||||
tmp = zero;
|
||||
setCheckerboard(tmp,resulto);
|
||||
std::cout<<tmp<<std::endl;
|
||||
std::cout<<std::endl;
|
||||
std::cout<<"=========== result ASM ============="<<std::endl;
|
||||
std::cout<<std::endl;
|
||||
tmp = zero;
|
||||
setCheckerboard(tmp,resulta);
|
||||
std::cout<<tmp<<std::endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef VLOAD
|
||||
#undef VSTORE
|
||||
#undef VMUL
|
||||
#undef VMADD
|
||||
#undef ZEND1
|
||||
#undef ZEND2
|
||||
#undef ZLOAD
|
||||
#undef ZMUL
|
||||
#undef ZMADD
|
||||
|
||||
#define VZERO(A) VZEROd(A)
|
||||
#define VTIMESI(A,B,C) VTIMESId(A,B,C)
|
||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
|
||||
|
||||
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
|
||||
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)
|
||||
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULd(Uri,Uir,Chi,UChi,Z)
|
||||
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDd(Uri,Uir,Chi,UChi,Z)
|
||||
#define ZEND1(A,B,C) ZEND1d(A,B,C)
|
||||
#define ZEND2(A,B,C) ZEND2d(A,B,C)
|
||||
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
|
||||
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
|
||||
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
|
||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#include <simd/Intel512double.h>
|
||||
|
||||
#define zz Z0
|
||||
|
||||
|
||||
void Zmul(void *ptr1,void *ptr2,void *ptr3)
|
||||
{
|
||||
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
|
||||
__asm__ ("kmovw %%eax, %%k6 " : : :);
|
||||
__asm__ ("mov $0x5555, %%eax " : : :"%eax");
|
||||
__asm__ ("kmovw %%eax, %%k7 " : : :);
|
||||
|
||||
#define CC result_00
|
||||
LOAD64(%r9,ptr1);
|
||||
LOAD64(%r8,ptr2);
|
||||
LOAD64(%r10,ptr3)
|
||||
__asm__ (
|
||||
VLOAD(0,%r8,CC)
|
||||
ZLOAD(0,%r9,Chi_00,Z0)
|
||||
ZMUL(Chi_00,Z0,CC,UChi_00,Z1)
|
||||
//VSTORE(0,%r10,UChi_00)
|
||||
//VSTORE(1,%r10,Z1)
|
||||
ZEND1(UChi_00,Z1,Z0)
|
||||
//VSTORE(2,%r10,UChi_00)
|
||||
ZEND2(UChi_00,Z1,Z0)
|
||||
//VSTORE(3,%r10,UChi_00)
|
||||
VSTORE(0,%r10,UChi_00)
|
||||
VLOAD(1,%r8,CC)
|
||||
ZLOAD(1,%r9,Chi_01,Z0)
|
||||
ZMUL(Chi_01,Z0,CC,UChi_01,Z1)
|
||||
ZEND1(UChi_01,Z1,Z0)
|
||||
ZEND2(UChi_01,Z1,Z0)
|
||||
VSTORE(1,%r10,UChi_01)
|
||||
VLOAD(2,%r8,CC)
|
||||
ZLOAD(2,%r9,Chi_02,Z0)
|
||||
ZMUL(Chi_02,Z0,CC,UChi_02,Z1)
|
||||
ZEND1(UChi_02,Z1,Z0)
|
||||
ZEND2(UChi_02,Z1,Z0)
|
||||
VSTORE(2,%r10,UChi_02)
|
||||
VLOAD(3,%r8,CC)
|
||||
ZLOAD(3,%r9,Chi_10,Z0)
|
||||
ZMUL(Chi_10,Z0,CC,UChi_10,Z1)
|
||||
ZEND1(UChi_10,Z1,Z0)
|
||||
ZEND2(UChi_10,Z1,Z0)
|
||||
VSTORE(3,%r10,UChi_10)
|
||||
VLOAD(4,%r8,CC)
|
||||
ZLOAD(4,%r9,Chi_11,Z0)
|
||||
ZMUL(Chi_11,Z0,CC,UChi_11,Z1)
|
||||
ZEND1(UChi_11,Z1,Z0)
|
||||
ZEND2(UChi_11,Z1,Z0)
|
||||
VSTORE(4,%r10,UChi_11)
|
||||
VLOAD(5,%r8,CC)
|
||||
ZLOAD(5,%r9,Chi_12,Z0)
|
||||
ZMUL(Chi_12,Z0,CC,UChi_12,Z1)
|
||||
ZEND1(UChi_12,Z1,Z0)
|
||||
ZEND2(UChi_12,Z1,Z0)
|
||||
VSTORE(5,%r10,UChi_12)
|
||||
);
|
||||
}
|
||||
void TimesMinusIAvx512(void *ptr1,void *ptr3)
|
||||
{
|
||||
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
|
||||
__asm__ ("kmovw %%eax, %%k6 " : : :);
|
||||
__asm__ ("mov $0x5555, %%eax " : : :"%eax");
|
||||
__asm__ ("kmovw %%eax, %%k7 " : : :);
|
||||
|
||||
MASK_REGS;
|
||||
|
||||
LOAD_CHI(ptr1);
|
||||
|
||||
__asm__ (
|
||||
VZERO(zz)
|
||||
VTIMESMINUSI(Chi_00,UChi_00,zz)
|
||||
VTIMESMINUSI(Chi_01,UChi_01,zz)
|
||||
VTIMESMINUSI(Chi_02,UChi_02,zz)
|
||||
VTIMESMINUSI(Chi_10,UChi_10,zz)
|
||||
VTIMESMINUSI(Chi_11,UChi_11,zz)
|
||||
VTIMESMINUSI(Chi_12,UChi_12,zz)
|
||||
);
|
||||
|
||||
SAVE_UCHI(ptr3);
|
||||
}
|
||||
|
||||
void TimesIAvx512(void *ptr1,void *ptr3)
|
||||
{
|
||||
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
|
||||
__asm__ ("kmov %%eax, %%k6 " : : :);
|
||||
__asm__ ("knot %%k6, %%k7 " : : :);
|
||||
|
||||
__asm__ ("kmovw %%eax, %%k6 " : : :);
|
||||
__asm__ ("mov $0x5555, %%eax " : : :"%eax");
|
||||
__asm__ ("kmovw %%eax, %%k7 " : : :);
|
||||
|
||||
MASK_REGS;
|
||||
|
||||
@ -252,41 +383,69 @@ void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3)
|
||||
|
||||
}
|
||||
|
||||
#undef VLOAD
|
||||
#undef VSTORE
|
||||
#undef VMUL
|
||||
#undef VMADD
|
||||
#undef ZEND1
|
||||
#undef ZEND2
|
||||
#undef ZLOAD
|
||||
#undef ZMUL
|
||||
#undef ZMADD
|
||||
#undef VZERO
|
||||
#undef VTIMESI
|
||||
#undef VTIMESI0
|
||||
#undef VTIMESI1
|
||||
#undef VTIMESI2
|
||||
#undef VTIMESMINUSI
|
||||
#undef ZMULMEM2SP
|
||||
#undef ZMADDMEM2SP
|
||||
#define VZERO(A) VZEROf(A)
|
||||
#define VMOV(A,B) VMOVf(A,B)
|
||||
#define VADD(A,B,C) VADDf(A,B,C)
|
||||
#define VSUB(A,B,C) VSUBf(A,B,C)
|
||||
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
|
||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
|
||||
#include <simd/Intel512single.h>
|
||||
|
||||
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
|
||||
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
|
||||
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z)
|
||||
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
|
||||
#define ZEND1(A,B,C) ZEND1f(A,B,C)
|
||||
#define ZEND2(A,B,C) ZEND2f(A,B,C)
|
||||
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
|
||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
||||
void ZmulF(void *ptr1,void *ptr2,void *ptr3)
|
||||
{
|
||||
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
|
||||
__asm__ ("kmovw %%eax, %%k6 " : : :);
|
||||
__asm__ ("mov $0x5555, %%eax " : : :"%eax");
|
||||
__asm__ ("kmovw %%eax, %%k7 " : : :);
|
||||
MASK_REGS;
|
||||
ZLOAD(0,ptr1,Chi_00,Z0);
|
||||
ZLOAD(1,ptr1,Chi_01,Z1);
|
||||
ZLOAD(2,ptr1,Chi_02,Z2);
|
||||
ZLOAD(3,ptr1,Chi_10,Z3);
|
||||
ZLOAD(4,ptr1,Chi_11,Z4);
|
||||
ZLOAD(5,ptr1,Chi_12,Z5);
|
||||
|
||||
VLOAD(0,ptr2,Chi_20);
|
||||
VLOAD(1,ptr2,Chi_21);
|
||||
VLOAD(2,ptr2,Chi_22);
|
||||
VLOAD(3,ptr2,Chi_30);
|
||||
VLOAD(4,ptr2,Chi_31);
|
||||
VLOAD(5,ptr2,Chi_32);
|
||||
|
||||
ZMUL(Chi_00,Z0,Chi_20,UChi_00,UChi_20);
|
||||
ZMUL(Chi_01,Z1,Chi_21,UChi_01,UChi_21);
|
||||
ZMUL(Chi_02,Z2,Chi_22,UChi_02,UChi_22);
|
||||
ZMUL(Chi_10,Z3,Chi_23,UChi_10,UChi_30);
|
||||
ZMUL(Chi_11,Z4,Chi_24,UChi_11,UChi_31);
|
||||
ZMUL(Chi_12,Z5,Chi_25,UChi_12,UChi_32);
|
||||
|
||||
ZEND1(UChi_00,UChi_20,Z0);
|
||||
ZEND1(UChi_01,UChi_21,Z1);
|
||||
ZEND1(UChi_02,UChi_22,Z2);
|
||||
ZEND1(UChi_10,UChi_30,Z3);
|
||||
ZEND1(UChi_11,UChi_31,Z4);
|
||||
ZEND1(UChi_12,UChi_32,Z5);
|
||||
|
||||
ZEND2(UChi_00,UChi_20,Z0);
|
||||
ZEND2(UChi_01,UChi_21,Z1);
|
||||
ZEND2(UChi_02,UChi_22,Z2);
|
||||
ZEND2(UChi_10,UChi_30,Z3);
|
||||
ZEND2(UChi_11,UChi_31,Z4);
|
||||
ZEND2(UChi_12,UChi_32,Z5);
|
||||
|
||||
SAVE_UCHI(ptr3);
|
||||
}
|
||||
|
||||
void TimesMinusIAvx512F(void *ptr1,void *ptr3)
|
||||
{
|
||||
MASK_REGS;
|
||||
|
||||
LOAD_CHI(ptr1);
|
||||
__asm__ (
|
||||
VZERO(zz)
|
||||
VTIMESMINUSI(Chi_00,UChi_00,zz)
|
||||
VTIMESMINUSI(Chi_01,UChi_01,zz)
|
||||
VTIMESMINUSI(Chi_02,UChi_02,zz)
|
||||
VTIMESMINUSI(Chi_10,UChi_10,zz)
|
||||
VTIMESMINUSI(Chi_11,UChi_11,zz)
|
||||
VTIMESMINUSI(Chi_12,UChi_12,zz)
|
||||
);
|
||||
SAVE_UCHI(ptr3);
|
||||
}
|
||||
|
||||
void TimesIAvx512F(void *ptr1,void *ptr3)
|
||||
{
|
||||
@ -311,7 +470,8 @@ void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3)
|
||||
|
||||
LOAD_CHI(ptr1);
|
||||
|
||||
MULT_2SPIN(ptr2);
|
||||
MULT_ADDSUB_2SPIN(ptr2);
|
||||
//MULT_2SPIN(ptr2);
|
||||
|
||||
SAVE_UCHI(ptr3);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user