Merge branch 'release/v0.6.0' into develop

2025-09-18 09:11:04 +01:00 · 2016-11-09 12:44:00 +00:00
parent 42c912f608 604f0ea2f6
commit 58f4950652
8 changed files with 93 additions and 20 deletions
--- a/1
+++ b/1
@@ -0,0 +1 @@
 README.md
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -126,6 +126,7 @@ namespace Grid {
    double Flops(void) {return flops;}
    double MFlops(void) {return flops/usec;}
    double USec(void)   {return (double)usec;}    
    FFT ( GridCartesian * grid ) :
    vgrid(grid),
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
 #include <x86intrin.h>
 #endif
 namespace Grid {
@@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
   return tmp;
 }
 #elif defined __x86_64__
 #include <x86intrin.h>
 inline uint64_t cyclecount(void){ 
  return __rdtsc();
  //  unsigned int dummy;
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
 int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);}
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  source =0;
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -10,6 +10,7 @@
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -53,24 +54,26 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
 }
 #if defined(AVX512) 
 #include <simd/Intel512wilson.h>
    ///////////////////////////////////////////////////////////
    // If we are AVX512 specialise the single precision routine
    ///////////////////////////////////////////////////////////
 #include <simd/Intel512wilson.h>
 #include <simd/Intel512single.h>
-static Vector<vComplexF> signs;
+static Vector<vComplexF> signsF;
-  int setupSigns(void ){
+  template<typename vtype>    
-    Vector<vComplexF> bother(2);
+  int setupSigns(Vector<vtype>& signs ){
    Vector<vtype> bother(2);
    signs = bother;
    vrsign(signs[0]);
    visign(signs[1]);
    return 1;
  }
-  static int signInit = setupSigns();
+
  static int signInitF = setupSigns(signsF);
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
@@ -78,6 +81,8 @@ static Vector<vComplexF> signs;
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
 #define COMPLEX_TYPE vComplexF
 #define signs signsF
 #undef KERNEL_DAG
 template<> void 
@@ -98,8 +103,8 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
 #undef FX 
 #define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
-#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 #undef KERNEL_DAG
@@ -113,8 +118,71 @@ template<> void
 WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #undef COMPLEX_TYPE
 #undef signs
 #undef VMOVRDUP
 #undef MAYBEPERM
 #undef MULT_2SPIN
 #undef FX 
-#endif
+///////////////////////////////////////////////////////////
 // If we are AVX512 specialise the double precision routine
 ///////////////////////////////////////////////////////////
 #include <simd/Intel512double.h>
 static Vector<vComplexD> signsD;
 #define signs signsD
 static int signInitD = setupSigns(signsD);
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
 #define COMPLEX_TYPE vComplexD
 #undef KERNEL_DAG
 template<> void 
 WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #define KERNEL_DAG
 template<> void 
 WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #undef VMOVIDUP
 #undef VMOVRDUP
 #undef MAYBEPERM
 #undef MULT_2SPIN
 #undef FX 
 #define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
 //#define VMOVIDUP(A,B,C)                                  VBCASTIDUPd(A,B,C)
 //#define VMOVRDUP(A,B,C)                                  VBCASTRDUPd(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 #undef KERNEL_DAG
 template<> void 
 WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #define KERNEL_DAG
 template<> void 
 WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 #undef COMPLEX_TYPE
 #undef signs
 #undef VMOVRDUP
 #undef MAYBEPERM
 #undef MULT_2SPIN
 #undef FX 
 #endif //AVX512
 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -5,7 +5,9 @@
  const uint64_t plocal =(uint64_t) & in._odata[0];
  //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
+  //COMPLEX_TYPE is vComplexF or vComplexD depending 
  //on the chosen precision
  COMPLEX_TYPE *isigns = &signs[0];
  MASK_REGS;
  int nmax=U._grid->oSites();
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -382,7 +382,6 @@ namespace Optimization {
  // Some Template specialization
  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
 #ifndef __INTEL_COMPILER
 #warning "Slow reduction due to incomplete reduce intrinsics"
  //Complex float Reduce
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@@ -93,10 +93,10 @@ int main (int argc, char ** argv)
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
  SpinMatrixF Sp; 
  Sp = zero; Sp = Sp+cVol;