Merge branch 'release/v0.6.0' into develop

2025-12-03 12:54:41 +00:00 · 2016-11-09 12:44:00 +00:00
parent 42c912f608 604f0ea2f6
commit 58f4950652
8 changed files with 93 additions and 20 deletions
--- a/1
+++ b/1
@@ -0,0 +1 @@
+README.md
--- a/lib/FFT.h
+++ b/lib/FFT.h
@@ -126,7 +126,8 @@ namespace Grid {
    
    double Flops(void) {return flops;}
    double MFlops(void) {return flops/usec;}
-    
+    double USec(void)   {return (double)usec;}    
+
    FFT ( GridCartesian * grid ) :
    vgrid(grid),
    Nd(grid->_ndimension),
--- a/lib/PerfCount.h
+++ b/lib/PerfCount.h
@@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #include <sys/syscall.h>
 #endif
+#ifdef __x86_64__
+#include <x86intrin.h>
+#endif

 namespace Grid {

@@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
   return tmp;
 }
 #elif defined __x86_64__
-#include <x86intrin.h>
 inline uint64_t cyclecount(void){ 
  return __rdtsc();
  //  unsigned int dummy;
--- a/lib/communicator/Communicator_none.cc
+++ b/lib/communicator/Communicator_none.cc
@@ -97,7 +97,7 @@ void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
 int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;}
-void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);}
+void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;}
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  source =0;
--- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc
+++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc
@@ -10,6 +10,7 @@

 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -53,24 +54,26 @@ WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,
 }

 #if defined(AVX512) 
-    
+#include <simd/Intel512wilson.h>
+
    ///////////////////////////////////////////////////////////
    // If we are AVX512 specialise the single precision routine
    ///////////////////////////////////////////////////////////
-    
-#include <simd/Intel512wilson.h>
+
 #include <simd/Intel512single.h>
    
-static Vector<vComplexF> signs;
-    
-  int setupSigns(void ){
-    Vector<vComplexF> bother(2);
+static Vector<vComplexF> signsF;
+
+  template<typename vtype>    
+  int setupSigns(Vector<vtype>& signs ){
+    Vector<vtype> bother(2);
    signs = bother;
    vrsign(signs[0]);
    visign(signs[1]);
    return 1;
  }
-  static int signInit = setupSigns();
+
+  static int signInitF = setupSigns(signsF);
  
 #define label(A)  ilabel(A)
 #define ilabel(A) ".globl\n"  #A ":\n" 
@@ -78,6 +81,8 @@ static Vector<vComplexF> signs;
 #define MAYBEPERM(A,perm) if (perm) { A ; }
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 #define FX(A) WILSONASM_ ##A
+#define COMPLEX_TYPE vComplexF
+#define signs signsF
  
 #undef KERNEL_DAG
 template<> void 
@@ -98,8 +103,8 @@ WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder
 #undef FX 
 #define FX(A) DWFASM_ ## A
 #define MAYBEPERM(A,B) 
-#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
-#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 				    
 #undef KERNEL_DAG
@@ -113,8 +118,71 @@ template<> void
 WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 #include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+#undef COMPLEX_TYPE
+#undef signs
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+	
+///////////////////////////////////////////////////////////
+// If we are AVX512 specialise the double precision routine
+///////////////////////////////////////////////////////////
+
+#include <simd/Intel512double.h>
+    
+static Vector<vComplexD> signsD;
+#define signs signsD
+static int signInitD = setupSigns(signsD);
+    
+#define MAYBEPERM(A,perm) if (perm) { A ; }
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
+#define FX(A) WILSONASM_ ##A
+#define COMPLEX_TYPE vComplexD
+  
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+      
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<WilsonImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 				    
-#endif
+#undef VMOVIDUP
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+#define FX(A) DWFASM_ ## A
+#define MAYBEPERM(A,B) 
+//#define VMOVIDUP(A,B,C)                                  VBCASTIDUPd(A,B,C)
+//#define VMOVRDUP(A,B,C)                                  VBCASTRDUPd(A,B,C)
+#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
+				    
+#undef KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
+							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+				    
+#define KERNEL_DAG
+template<> void 
+WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
+	
+#undef COMPLEX_TYPE
+#undef signs
+#undef VMOVRDUP
+#undef MAYBEPERM
+#undef MULT_2SPIN
+#undef FX 
+
+#endif //AVX512

 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
--- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
+++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h
@@ -5,7 +5,9 @@
  const uint64_t plocal =(uint64_t) & in._odata[0];

  //  vComplexF isigns[2] = { signs[0], signs[1] };
-  vComplexF *isigns = &signs[0];
+  //COMPLEX_TYPE is vComplexF or vComplexD depending 
+  //on the chosen precision
+  COMPLEX_TYPE *isigns = &signs[0];

  MASK_REGS;
  int nmax=U._grid->oSites();
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -382,7 +382,6 @@ namespace Optimization {
  // Some Template specialization

  // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases
-
 #ifndef __INTEL_COMPILER
 #warning "Slow reduction due to incomplete reduce intrinsics"
  //Complex float Reduce
--- a/tests/core/Test_fftf.cc
+++ b/tests/core/Test_fftf.cc
@@ -93,10 +93,10 @@ int main (int argc, char ** argv)
  C=C-Ctilde;
  std::cout << "diff scalar "<<norm2(C) << std::endl;

-  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Stilde,S,0,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,1,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,2,FFT::forward);  S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;

  SpinMatrixF Sp; 
  Sp = zero; Sp = Sp+cVol;