Merge branch 'develop' into feature/gpu-port

Conflicts: Grid/qcd/action/fermion/WilsonKernelsAsm.cc Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h benchmarks/Benchmark_comms.cc
2026-05-30 14:04:18 +01:00 · 2019-08-14 18:56:54 +01:00
parent 3e49dc8a67 bca36d9bc3
commit 48e6efc7c9
6 changed files with 159 additions and 22 deletions
@@ -0,0 +1,127 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+
+
+    Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Guido Cossu <guido.cossu@ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+namespace Grid {
+namespace QCD {
+
+
+///////////////////////////////////////////////////////////
+// Default to no assembler implementation
+///////////////////////////////////////////////////////////
+template<class Impl> void // generic AsmDhopSite: the default when no assembler implementation exists for Impl
+WilsonKernels<Impl >::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0); // NOTE(review): assert is compiled out under NDEBUG, so such a build would return here without writing 'out'
+}
+
+template<class Impl> void // generic AsmDhopSiteDag: the default when no assembler implementation exists for Impl
+WilsonKernels<Impl >::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0); // NOTE(review): assert is compiled out under NDEBUG, so such a build would return here without writing 'out'
+}
+
+template<class Impl> void // generic AsmDhopSiteInt: the default when no assembler implementation exists for Impl
+WilsonKernels<Impl >::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0); // NOTE(review): assert is compiled out under NDEBUG, so such a build would return here without writing 'out'
+}
+
+template<class Impl> void // generic AsmDhopSiteDagInt: the default when no assembler implementation exists for Impl
+WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0); // NOTE(review): assert is compiled out under NDEBUG, so such a build would return here without writing 'out'
+}
+
+template<class Impl> void // generic AsmDhopSiteExt: the default when no assembler implementation exists for Impl
+WilsonKernels<Impl >::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0); // NOTE(review): assert is compiled out under NDEBUG, so such a build would return here without writing 'out'
+}
+
+template<class Impl> void // generic AsmDhopSiteDagExt: the default when no assembler implementation exists for Impl
+WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
+					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
+{
+  assert(0); // NOTE(review): assert is compiled out under NDEBUG, so such a build would return here without writing 'out'
+}
+
+#include <Grid/qcd/action/fermion/WilsonKernelsAsmAvx512.h>
+#include <Grid/qcd/action/fermion/WilsonKernelsAsmQPX.h>
+
+#define INSTANTIATE_ASM(A)\
+template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+ \
+template void WilsonKernels<A>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+ \
+template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+template void WilsonKernels<A>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+ \
+template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
+                                  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
+
+INSTANTIATE_ASM(WilsonImplF); // each use explicitly instantiates the six AsmDhopSite* members of WilsonKernels for that Impl
+INSTANTIATE_ASM(WilsonImplD); // NOTE(review): the macro body already ends in ';', so the ';' after each use is an empty declaration
+INSTANTIATE_ASM(ZWilsonImplF);
+INSTANTIATE_ASM(ZWilsonImplD);
+INSTANTIATE_ASM(GparityWilsonImplF);
+INSTANTIATE_ASM(GparityWilsonImplD);
+INSTANTIATE_ASM(DomainWallVec5dImplF);
+INSTANTIATE_ASM(DomainWallVec5dImplD);
+INSTANTIATE_ASM(ZDomainWallVec5dImplF);
+INSTANTIATE_ASM(ZDomainWallVec5dImplD);
+
+INSTANTIATE_ASM(WilsonImplFH); // second batch: the FH/DF-suffixed counterparts of the Impl types above
+INSTANTIATE_ASM(WilsonImplDF);
+INSTANTIATE_ASM(ZWilsonImplFH);
+INSTANTIATE_ASM(ZWilsonImplDF);
+INSTANTIATE_ASM(GparityWilsonImplFH);
+INSTANTIATE_ASM(GparityWilsonImplDF);
+INSTANTIATE_ASM(DomainWallVec5dImplFH);
+INSTANTIATE_ASM(DomainWallVec5dImplDF);
+INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
+INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
+
+}}
+
@@ -25,8 +25,8 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
-			   /*  END LEGAL */
-#include <Grid.h>
+/*  END LEGAL */
+#include <Grid/Grid.h>

 #pragma once 

@@ -28,11 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #pragma once

-#include <Grid.h>
+#include <Grid/Grid.h>

 #ifdef AVX512
-#include <simd/Intel512common.h>
-#include <simd/Intel512avx.h>
+#include <Grid/simd/Intel512common.h>
+#include <Grid/simd/Intel512avx.h>
 #endif

 // Interleave operations from two directions
@@ -681,7 +681,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
  gauge3 =(uint64_t)&UU[sU]( T ); 
  
  // This is the single precision 5th direction vectorised kernel
-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
@@ -735,7 +735,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
   
 }

-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 								    DoubledGaugeFieldView &U,
 								    DoubledGaugeFieldView &UUU,
@@ -820,7 +820,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl

  // This is the single precision 5th direction vectorised kernel

-#include <simd/Intel512single.h>
+#include <Grid/simd/Intel512single.h>
 template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
@@ -889,7 +889,7 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
 #endif
 }

-#include <simd/Intel512double.h>
+#include <Grid/simd/Intel512double.h>
 template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, 
 							       DoubledGaugeFieldView &U,
 							       DoubledGaugeFieldView &UUU,
@@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
-#include <Grid.h>
+#include <Grid/Grid.h>

 #pragma once

@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 using namespace std;
 using namespace Grid;
- ;

 struct time_statistics{
  double mean;
@@ -187,9 +186,9 @@ int main (int argc, char ** argv)
 	rbuf[mu].resize(lat*lat*lat*Ls);
 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
      }
+      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      for(int i=0;i<Nloop;i++){
      double start=usecond();
@@ -276,15 +275,22 @@ int main (int argc, char ** argv)
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
+      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int d=0;d<8;d++){
+<<<<<<< HEAD
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+=======
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	bzero((void *)xbuf[d],bytes);
+	bzero((void *)rbuf[d],bytes);
+>>>>>>> develop
      }

      int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      double dbytes;
      for(int i=0;i<Nloop;i++){
@@ -373,15 +379,22 @@ int main (int argc, char ** argv)
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
+      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int d=0;d<8;d++){
+<<<<<<< HEAD
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	//	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+=======
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	bzero((void *)xbuf[d],bytes);
+	bzero((void *)rbuf[d],bytes);
+>>>>>>> develop
      }

      int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double dbytes;
      for(int i=0;i<Nloop;i++){
 	double start=usecond();
@@ -471,15 +484,13 @@ int main (int argc, char ** argv)
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
+      uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int d=0;d<8;d++){
-	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	//	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
-	//	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
+	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
      }

      int ncomm;
-      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double dbytes;
      for(int i=0;i<Nloop;i++){
 	double start=usecond();
@@ -262,7 +262,6 @@ Set HEADER_SEARCH_PATHS to:

    $Grid/build$(CONFIGURATION)/Grid
    $Grid
-    $Grid/Grid

 followed by (***the order is important***) the locations reported by `grid-config --cxxflags`, ignoring duplicates, e.g.:

@@ -272,7 +271,7 @@ followed by (***the order is important***) the locations reported by `grid-confi

 **Note: the easiest way to set this value is to put it all on one line, space separated, and edit the text to the right of `HEADER_SEARCH_PATHS`**, i.e.:

-    $Grid/build$(CONFIGURATION)/Grid $Grid $Grid/Grid $GridPre/openmpi/include $GridPkg/include $GridPre/lime/include
+    $Grid/build$(CONFIGURATION)/Grid $Grid $GridPre/openmpi/include $GridPkg/include $GridPre/lime/include

 #### LIBRARY_SEARCH_PATHS

@@ -298,7 +297,7 @@ The easiest way to link to all required libraries is to obtain a list of all lib

 and pasting the output ***with `-lGrid -lHadrons ` prepended*** (including the `-l` switches) directly into `OTHER_LDFLAGS`, e.g.:

-    -lGrid -lHadrons -lmpi -lhdf5_cpp -lz -lcrypto -llime -lfftw3f -lfftw3 -lmpfr -lgmp -lstdc++ -lm -lz -lhdf5
+    -lGrid -lHadrons -lmpi -lhdf5_cpp -lhdf5 -lz -lcrypto -llime -lfftw3f -lfftw3 -lmpfr -lgmp -lm

 ## Make additional configurations