Merge branch 'develop' into feature/gpu-port

Conflicts: Grid/qcd/action/fermion/WilsonKernelsAsm.cc Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h benchmarks/Benchmark_comms.cc
2026-05-22 18:14:17 +01:00 · 2019-08-14 18:56:54 +01:00
parent 3e49dc8a67 bca36d9bc3
commit 48e6efc7c9
6 changed files with 159 additions and 22 deletions
@@ -0,0 +1,127 @@
 /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 Author: paboyle <paboyle@ph.ed.ac.uk>
 Author: Guido Cossu <guido.cossu@ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
 *************************************************************************************/
 /*  END LEGAL */
 #include <Grid/qcd/action/fermion/FermionCore.h>
 namespace Grid {
 namespace QCD {
 ///////////////////////////////////////////////////////////
 // Default to no assembler implementation
 ///////////////////////////////////////////////////////////
 // Generic AsmDhopSite: this default provides no assembler kernel, so any call
 // trips assert(0). All arguments are ignored. When NDEBUG is defined the assert
 // compiles away and the function returns without writing to `out`.
 // NOTE(review): per-implementation versions presumably come from the
 // WilsonKernelsAsm*.h headers included later in this file -- confirm.
 template <class Impl>
 void WilsonKernels<Impl>::AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                       SiteHalfSpinor *buf,
                                       int ss, int ssU, int Ls, int Ns,
                                       const FermionField &in, FermionField &out)
 {
   assert(0);
 }
 // Generic AsmDhopSiteDag: this default provides no assembler kernel, so any call
 // trips assert(0). All arguments are ignored. When NDEBUG is defined the assert
 // compiles away and the function returns without writing to `out`.
 template <class Impl>
 void WilsonKernels<Impl>::AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                          SiteHalfSpinor *buf,
                                          int ss, int ssU, int Ls, int Ns,
                                          const FermionField &in, FermionField &out)
 {
   assert(0);
 }
 // Generic AsmDhopSiteInt: this default provides no assembler kernel, so any call
 // trips assert(0). All arguments are ignored. When NDEBUG is defined the assert
 // compiles away and the function returns without writing to `out`.
 template <class Impl>
 void WilsonKernels<Impl>::AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                          SiteHalfSpinor *buf,
                                          int ss, int ssU, int Ls, int Ns,
                                          const FermionField &in, FermionField &out)
 {
   assert(0);
 }
 // Generic AsmDhopSiteDagInt: this default provides no assembler kernel, so any
 // call trips assert(0). All arguments are ignored. When NDEBUG is defined the
 // assert compiles away and the function returns without writing to `out`.
 template <class Impl>
 void WilsonKernels<Impl>::AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                             SiteHalfSpinor *buf,
                                             int ss, int ssU, int Ls, int Ns,
                                             const FermionField &in, FermionField &out)
 {
   assert(0);
 }
 // Generic AsmDhopSiteExt: this default provides no assembler kernel, so any call
 // trips assert(0). All arguments are ignored. When NDEBUG is defined the assert
 // compiles away and the function returns without writing to `out`.
 template <class Impl>
 void WilsonKernels<Impl>::AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                          SiteHalfSpinor *buf,
                                          int ss, int ssU, int Ls, int Ns,
                                          const FermionField &in, FermionField &out)
 {
   assert(0);
 }
 // Generic AsmDhopSiteDagExt: this default provides no assembler kernel, so any
 // call trips assert(0). All arguments are ignored. When NDEBUG is defined the
 // assert compiles away and the function returns without writing to `out`.
 template <class Impl>
 void WilsonKernels<Impl>::AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                             SiteHalfSpinor *buf,
                                             int ss, int ssU, int Ls, int Ns,
                                             const FermionField &in, FermionField &out)
 {
   assert(0);
 }
 #include <Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h>
 #include <Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h>
 // INSTANTIATE_ASM(A): emits explicit instantiations of the six
 // WilsonKernels<A>::AsmDhopSite* member functions for implementation type A.
 //
 // The last line of the macro must NOT end in a backslash. A trailing backslash
 // splices the following source line into the macro body, which would swallow the
 // first INSTANTIATE_ASM(...) invocation below (leaving that type uninstantiated)
 // and append an unexpanded INSTANTIATE_ASM(...) token sequence to every expansion.
 #define INSTANTIATE_ASM(A)\
 template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 \
 template void WilsonKernels<A>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 \
 template void WilsonKernels<A>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 \
 template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 \
 template void WilsonKernels<A>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
 \
 template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);

 // Explicit instantiations, one per implementation type.
 INSTANTIATE_ASM(WilsonImplF);
 INSTANTIATE_ASM(WilsonImplD);
 INSTANTIATE_ASM(ZWilsonImplF);
 INSTANTIATE_ASM(ZWilsonImplD);
 INSTANTIATE_ASM(GparityWilsonImplF);
 INSTANTIATE_ASM(GparityWilsonImplD);
 INSTANTIATE_ASM(DomainWallVec5dImplF);
 INSTANTIATE_ASM(DomainWallVec5dImplD);
 INSTANTIATE_ASM(ZDomainWallVec5dImplF);
 INSTANTIATE_ASM(ZDomainWallVec5dImplD);
 INSTANTIATE_ASM(WilsonImplFH);
 INSTANTIATE_ASM(WilsonImplDF);
 INSTANTIATE_ASM(ZWilsonImplFH);
 INSTANTIATE_ASM(ZWilsonImplDF);
 INSTANTIATE_ASM(GparityWilsonImplFH);
 INSTANTIATE_ASM(GparityWilsonImplDF);
 INSTANTIATE_ASM(DomainWallVec5dImplFH);
 INSTANTIATE_ASM(DomainWallVec5dImplDF);
 INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
 INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
 }}
@@ -25,8 +25,8 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-			   /*  END LEGAL */
+/*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #pragma once 
@@ -28,11 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #pragma once
-#include <Grid.h>
+#include <Grid/Grid.h>
 #ifdef AVX512
-#include <simd/Intel512common.h>
+#include <Grid/simd/Intel512common.h>
-#include <simd/Intel512avx.h>
+#include <Grid/simd/Intel512avx.h>
 #endif
 // Interleave operations from two directions
@@ -681,7 +681,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  gauge3 =(uint64_t)&UU[sU]( T ); 
  // This is the single precision 5th direction vectorised kernel
-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
@@ -735,7 +735,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
 }
-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
@@ -820,7 +820,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
  // This is the single precision 5th direction vectorised kernel
-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
@@ -889,7 +889,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 #endif
 }
-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>
 #pragma once
@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 ;
 struct time_statistics{
  double mean;
@@ -187,9 +186,9 @@ int main (int argc, char ** argv)
 	rbuf[mu].resize(lat*lat*lat*Ls);
 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
      }
      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int i=0;i<Nloop;i++){
      double start=usecond();
@@ -276,15 +275,22 @@ int main (int argc, char ** argv)
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int d=0;d<8;d++){
 <<<<<<< HEAD
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 =======
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
 	bzero((void *)xbuf[d],bytes);
 	bzero((void *)rbuf[d],bytes);
 >>>>>>> develop
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double dbytes;
      for(int i=0;i<Nloop;i++){
@@ -373,15 +379,22 @@ int main (int argc, char ** argv)
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int d=0;d<8;d++){
 <<<<<<< HEAD
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 =======
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
 	bzero((void *)xbuf[d],bytes);
 	bzero((void *)rbuf[d],bytes);
 >>>>>>> develop
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double dbytes;
      for(int i=0;i<Nloop;i++){
 	double start=usecond();
@@ -471,15 +484,13 @@ int main (int argc, char ** argv)
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int d=0;d<8;d++){
-	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
-	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
 	//	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double dbytes;
      for(int i=0;i<Nloop;i++){
 	double start=usecond();
@@ -262,7 +262,6 @@ Set HEADER_SEARCH_PATHS to:
    $Grid/build$(CONFIGURATION)/Grid
    $Grid
    $Grid/Grid
 followed by (***the order is important***) the locations reported by `grid-config --cxxflags`, ignoring duplicates, e.g.:
@@ -272,7 +271,7 @@ followed by (***the order is important***) the locations reported by `grid-confi
 **Note: the easiest way to set this value is to put it all on one line, space separated, and edit the text to the right of `HEADER_SEARCH_PATHS`**, i.e.:
-    $Grid/build$(CONFIGURATION)/Grid $Grid $Grid/Grid $GridPre/openmpi/include $GridPkg/include $GridPre/lime/include
+    $Grid/build$(CONFIGURATION)/Grid $Grid $GridPre/openmpi/include $GridPkg/include $GridPre/lime/include
 #### LIBRARY_SEARCH_PATHS
@@ -298,7 +297,7 @@ The easiest way to link to all required libraries is to obtain a list of all lib
 and pasting the output ***with `-lGrid -lHadrons ` prepended*** (including the `-l` switches) directly into `OTHER_LDFLAGS`, e.g.:
-    -lGrid -lHadrons -lmpi -lhdf5_cpp -lz -lcrypto -llime -lfftw3f -lfftw3 -lmpfr -lgmp -lstdc++ -lm -lz -lhdf5
+    -lGrid -lHadrons -lmpi -lhdf5_cpp -lhdf5 -lz -lcrypto -llime -lfftw3f -lfftw3 -lmpfr -lgmp -lm
 ## Make additional configurations