1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Merge branch 'release/v0.6.0' into develop

This commit is contained in:
paboyle 2016-11-09 12:44:00 +00:00
commit 58f4950652
8 changed files with 93 additions and 20 deletions

1
README Symbolic link
View File

@ -0,0 +1 @@
README.md

View File

@ -126,7 +126,8 @@ namespace Grid {
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) :
vgrid(grid),
Nd(grid->_ndimension),

View File

@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#else
#include <sys/syscall.h>
#endif
#ifdef __x86_64__
#include <x86intrin.h>
#endif
namespace Grid {
@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
return tmp;
}
#elif defined __x86_64__
#include <x86intrin.h>
inline uint64_t cyclecount(void){
return __rdtsc();
// unsigned int dummy;

View File

@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ assert(0);}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0;

View File

@ -10,6 +10,7 @@
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -53,24 +54,26 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
}
#if defined(AVX512)
#include <simd/Intel512wilson.h>
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
static Vector<vComplexF> signs;
int setupSigns(void ){
Vector<vComplexF> bother(2);
static Vector<vComplexF> signsF;
template<typename vtype>
int setupSigns(Vector<vtype>& signs ){
Vector<vtype> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
return 1;
}
static int signInit = setupSigns();
static int signInitF = setupSigns(signsF);
#define label(A) ilabel(A)
#define ilabel(A) ".globl\n" #A ":\n"
@ -78,6 +81,8 @@ static Vector<vComplexF> signs;
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define FX(A) WILSONASM_ ##A
#define COMPLEX_TYPE vComplexF
#define signs signsF
#undef KERNEL_DAG
template<> void
@ -98,8 +103,8 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
#undef FX
#define FX(A) DWFASM_ ## A
#define MAYBEPERM(A,B)
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
//#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
//#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
#undef KERNEL_DAG
@ -113,8 +118,71 @@ template<> void
WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef COMPLEX_TYPE
#undef signs
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#undef FX
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the double precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512double.h>
static Vector<vComplexD> signsD;
#define signs signsD
static int signInitD = setupSigns(signsD);
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define FX(A) WILSONASM_ ##A
#define COMPLEX_TYPE vComplexD
#undef KERNEL_DAG
template<> void
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG
template<> void
WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#endif
#undef VMOVIDUP
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#undef FX
#define FX(A) DWFASM_ ## A
#define MAYBEPERM(A,B)
//#define VMOVIDUP(A,B,C) VBCASTIDUPd(A,B,C)
//#define VMOVRDUP(A,B,C) VBCASTRDUPd(A,B,C)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
#undef KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#define KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef COMPLEX_TYPE
#undef signs
#undef VMOVRDUP
#undef MAYBEPERM
#undef MULT_2SPIN
#undef FX
#endif //AVX512
#define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\

View File

@ -5,7 +5,9 @@
const uint64_t plocal =(uint64_t) & in._odata[0];
// vComplexF isigns[2] = { signs[0], signs[1] };
vComplexF *isigns = &signs[0];
//COMPLEX_TYPE is vComplexF of vComplexD depending
//on the chosen precision
COMPLEX_TYPE *isigns = &signs[0];
MASK_REGS;
int nmax=U._grid->oSites();

View File

@ -382,7 +382,6 @@ namespace Optimization {
// Some Template specialization
// Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
#ifndef __INTEL_COMPILER
#warning "Slow reduction due to incomplete reduce intrinsics"
//Complex float Reduce

View File

@ -93,10 +93,10 @@ int main (int argc, char ** argv)
C=C-Ctilde;
std::cout << "diff scalar "<<norm2(C) << std::endl;
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
SpinMatrixF Sp;
Sp = zero; Sp = Sp+cVol;