Simplify Impl

2025-12-06 22:34:41 +00:00 · 2019-06-09 22:26:27 +01:00
parent d6c0e0756d
commit 36f06555a2
6 changed files with 33 additions and 44 deletions
--- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
+++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h
@@ -102,19 +102,6 @@ public:
 #endif
  }

-  static accelerator_inline void multLinkProp(SitePropagator &phi,
-					      const SiteDoubledGaugeField &U,
-					      const SitePropagator &chi,int mu) 
-  {
-    SiteGaugeLink UU;
-    for (int i = 0; i < Dimension; i++) {
-      for (int j = 0; j < Dimension; j++) {
-        vsplat(UU()()(i, j), U(mu)()(i, j));
-      }
-    }
-    mult(&phi(), &UU(), &chi());
-  }
-
  inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu) 
  {
    SiteScalarGaugeField  ScalarUmu;
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -73,6 +73,14 @@ public:
  // provide the multiply by link that is differentiated between Gparity (with
  // flavour index) and non-Gparity
  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi, 
+					  const SiteDoubledGaugeField &U,
+					  const _Spinor &chi, 
+					  int mu) 
+  {
+    assert(0);
+  } 
+  template<class _Spinor>
  static accelerator_inline void multLink(_Spinor &phi, 
 					  const SiteDoubledGaugeField &U,
 					  const _Spinor &chi, 
@@ -182,14 +190,6 @@ public:
    }
 #endif   
  }
-  // Fixme: Gparity prop * link
-  static accelerator_inline void multLinkProp(SitePropagator &phi, 
-					      const SiteDoubledGaugeField &U,
-					      const SitePropagator &chi, 
-					      int mu)
-  {
-    assert(0);
-  }

  template <class ref>
  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -83,22 +83,23 @@ public:
  static accelerator_inline void multLink(_Spinor &phi,
 					  const SiteDoubledGaugeField &U,
 					  const _Spinor &chi,
-					  int mu,
-					  StencilEntry *SE,
-					  StencilView &St) 
+					  int mu) 
  {
    auto UU = coalescedRead(U(mu));
    mult(&phi(), &UU, &chi());
  }
-    
-  static accelerator_inline void multLinkProp(SitePropagator &phi,
+  template<class _Spinor>
+  static accelerator_inline void multLink(_Spinor &phi,
 					  const SiteDoubledGaugeField &U,
-					      const SitePropagator &chi,
-					      int mu) 
+					  const _Spinor &chi,
+					  int mu,
+					  StencilEntry *SE,
+					  StencilView &St) 
  {
-    mult(&phi(), &U(mu), &chi());
+    multLink(phi,U,chi,mu);
  }
    
+      
  template <class ref>
  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) 
  {
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -364,19 +364,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { HOST_CALL(GenericDhopSite); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSite);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite);     return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSite); printf(".");    return;}
 #endif
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { HOST_CALL(GenericDhopSiteInt); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteInt);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);     return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt); printf("-");    return;}
 #endif
   } else if( exterior ) { 
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { HOST_CALL(GenericDhopSiteExt); return;}
 #ifndef GRID_NVCC
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { HOST_CALL(HandDhopSiteExt);    return;}
-     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt);     return;}
+     if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteExt); printf("+");    return;}
 #endif
   }
   assert(0 && " Kernel optimisation case not covered ");
@@ -440,7 +440,7 @@ void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(const SitePropagator &
  SitePropagator result, tmp;
  Gamma g5(Gamma::Algebra::Gamma5);

-  Impl::multLinkProp(tmp, U[sU], q_in_1, mu);
+  Impl::multLink(tmp, U[sU], q_in_1, mu);

  result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);

@@ -469,7 +469,7 @@ void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(const SitePropagator &
  SitePropagator result, tmp;
  Gamma g5(Gamma::Algebra::Gamma5);

-  Impl::multLinkProp(tmp, U[sU], q_in_1, mu + Nd);
+  Impl::multLink(tmp, U[sU], q_in_1, mu + Nd);

  result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
  if (switch_sign) {
@@ -496,7 +496,7 @@ void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
 {
  SitePropagator result;
  
-  Impl::multLinkProp(result, U[sU], q_in, mu);
+  Impl::multLink(result, U[sU], q_in, mu);
  result = WilsonCurrentFwd(result, mu);

  // Zero any unwanted timeslice entries.
@@ -525,7 +525,7 @@ void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
                                                     bool switch_sign)
 {
  SitePropagator result;
-  Impl::multLinkProp(result, U[sU], q_in, mu + Nd);
+  Impl::multLink(result, U[sU], q_in, mu + Nd);
  result = WilsonCurrentBwd(result, mu);

  // Zero any unwanted timeslice entries.
--- a/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationGparity.cc.master
+++ b/Grid/qcd/action/fermion/instantiation/WilsonKernelsInstantiationGparity.cc.master
@@ -66,6 +66,7 @@ void WilsonKernels<IMPLEMENTATION>::ContractConservedCurrentSiteBwd( const SiteP

 HAND_SPECIALISE_GPARITY(IMPLEMENTATION);

+
 template class WilsonKernels<IMPLEMENTATION>; 


--- a/14
+++ b/14
@@ -6,17 +6,20 @@ GPU branch code item work list
 * 0) Single GPU
 - 128 bit integer table load in GPU code.
 - coalescedRead <- threadIdx.x
+- Gianluca's changes to Cayley into gpu-port
+- GPU accelerate EOFA
 - Clean up PRAGMAS, and SIMT_loop
  thread_loop interface revisit.
  for_n
  for
+- Staggered kernels -> GPU coalesced loop
+- Staggered kernels inline for GPU -- DONE


-* 2) 5D terms
+* 2) 5D terms & Gianluca
  - Cayley coefficients -> GPU retention or prefetch
-  - Gianluca's changes to Cayley into gpu-port
-  - GPU accelerate EOFA
  - Mobius kernel fusion. -- Gianluca?
+  - Make GPU offload reductions optionally deterministic -- Gianluca

 * 3) Comms/NVlink
 - OpenMP tasks to run comms threads. 
@@ -30,12 +33,9 @@ GPU branch code item work list

 * 5) Misc

- SIMD dirs in stencil
 - Conserved current clean up.
 - multLinkProp eliminate
- Staggered kernels -> GPU coalesced loop
- Staggered kernels inline for GPU -- DONE
- Make GPU offload reductions optionally deterministic -- Gianluca
+- SIMD dirs in stencil
 
 7) Accelerate the cshift