Merge branch 'master' of https://github.com/paboyle/Grid into scidac1_2

Conflicts: lib/qcd/action/fermion/WilsonKernels.h
2026-05-24 11:04:17 +01:00 · 2016-01-11 11:36:45 -05:00
parent 5924e5a562 c99d748da6
commit 5c57d4f403
268 changed files with 12202 additions and 1707 deletions
@@ -1,3 +1,34 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/qcd/action/fermion/WilsonFermion5D.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
 #include <Grid.h>
 #include <PerfCount.h>

@@ -71,6 +102,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

  ImportGauge(_Umu);
  commtime=0;
+  jointime=0;
  dslashtime=0;
 }  
 template<class Impl>
@@ -206,6 +238,7 @@ void WilsonFermion5D<Impl>::Report(void)
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Halo   time "<<commtime <<" us"<<std::endl;
  std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
+  std::cout<<GridLogMessage << "join   time "<<jointime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil All    time "<<Stencil.halotime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
@@ -241,6 +274,18 @@ template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
 					 const FermionField &in, FermionField &out,int dag)
+{
+  if ( Impl::overlapCommsCompute () ) { 
+    DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
+  } else { 
+    DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
+  }
+}
+
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
+					 DoubledGaugeField & U,
+					 const FermionField &in, FermionField &out,int dag)
 {
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));

@@ -254,8 +299,12 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
  int nwork = U._grid->oSites();
  
  commtime -=usecond();
-  st.HaloExchange(in,comm_buf,compressor);
+  std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
  commtime +=usecond();
+
+  jointime -=usecond();
+  thr.join();
+  jointime +=usecond();
  
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
@@ -365,6 +414,137 @@ PARALLEL_FOR_LOOP
  }
  dslashtime +=usecond();
 }
+
+template<class Impl>
+void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
+						     DoubledGaugeField & U,
+						     const FermionField &in, FermionField &out,int dag)
+{
+  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
+
+  Compressor compressor(dag);
+
+  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
+
+  int threads = GridThread::GetThreads();
+  int HT      = GridThread::GetHyperThreads();
+  int cores   = GridThread::GetCores();
+  int nwork = U._grid->oSites();
+  
+  commtime -=usecond();
+  std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor);
+  commtime +=usecond();
+  
+  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
+  // Not loop ordering and data layout.
+  // Designed to create 
+  // - per thread reuse in L1 cache for U
+  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
+  bool local    = true;
+  bool nonlocal = false;
+  dslashtime -=usecond();
+  if ( dag == DaggerYes ) {
+    if( this->HandOptDslash ) {
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
+	  }
+      }
+    } else { 
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	{
+	  int sd;
+	  for(sd=0;sd<Ls;sd++){
+	    int sU=ss;
+	    int sF = sd+Ls*sU;
+	    Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
+	  }
+	}
+      }
+    }
+  } else {
+    if( this->HandOptDslash ) {
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
+	}
+      }
+    } else { 
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU; 
+	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
+	}
+      }
+    }
+  }
+  dslashtime +=usecond();
+
+  jointime -=usecond();
+  thr.join();
+  jointime +=usecond();
+
+  local    = false;
+  nonlocal = true;
+  dslashtime -=usecond();
+  if ( dag == DaggerYes ) {
+    if( this->HandOptDslash ) {
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
+	  }
+      }
+    } else { 
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	{
+	  int sd;
+	  for(sd=0;sd<Ls;sd++){
+	    int sU=ss;
+	    int sF = sd+Ls*sU;
+	    Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
+	  }
+	}
+      }
+    }
+  } else {
+    if( this->HandOptDslash ) {
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU;
+	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
+	}
+      }
+    } else { 
+PARALLEL_FOR_LOOP
+      for(int ss=0;ss<U._grid->oSites();ss++){
+	int sU=ss;
+	for(int s=0;s<Ls;s++){
+	  int sF = s+Ls*sU; 
+	  Kernels::DiracOptDhopSite(st,U,comm_buf,sF,sU,in,out,local,nonlocal);
+	}
+      }
+    }
+  }
+  dslashtime +=usecond();
+
+
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {