From 7b59ab5bd74f94ef1fb4fef1872119bea357d196 Mon Sep 17 00:00:00 2001
From: Peter Boyle <paboyle@ph.ed.ac.uk>
Date: Mon, 3 Jun 2019 15:46:26 +0100
Subject: [PATCH] Compiling after reorganisation

---
 .../WilsonKernelsGpuImplementation.h          | 341 ++++++++++++++++++
 1 file changed, 341 insertions(+)
 create mode 100644 Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
new file mode 100644
index 00000000..f3bdb54f
--- /dev/null
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsGpuImplementation.h
@@ -0,0 +1,341 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc
+
+Copyright (C) 2018
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+/* END LEGAL */
+
+#pragma once
+
+#include <Grid/qcd/action/fermion/FermionCore.h>
+
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////
+// Gpu implementation; thread loop is implicit ; move to header
+//////////////////////////////////////////////////////////////
+accelerator_inline void synchronise(void)
+{
+#ifdef __CUDA_ARCH__
+  __syncthreads();
+#endif
+  return;
+}
+accelerator_inline int get_my_lanes(int Nsimd)
+{
+#ifdef __CUDA_ARCH__
+  return 1;
+#else
+  return Nsimd;
+#endif
+}
+accelerator_inline int get_my_lane_offset(int Nsimd)
+{
+#ifdef __CUDA_ARCH__
+  return ( (threadIdx.x) % Nsimd);
+#else
+  return 0;
+#endif
+}
+
+accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
+{
+#ifdef __CUDA_ARCH__
+  static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
+  uint4 * mem_pun  = (uint4 *)mem;   // force 128 bit loads
+  uint4 * chip_pun = (uint4 *)&chip;
+  * chip_pun = * mem_pun;
+#else
+  chip = *mem;
+#endif
+  return;
+}
+
+#ifdef GPU_VEC
+#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)      \
+  if (SE._is_local) {                                   \
+    int mask = Nsimd >> (ptype + 1);                    \
+    int plane= SE._permute ? (lane ^ mask) : lane;      \
+    auto in_l = extractLane(plane,in[SE._offset+s]);    \
+    spProj(chi,in_l);                                   \
+  } else {                                              \
+    chi = extractLane(lane,buf[SE._offset+s]);          \
+  }                                                     \
+  synchronise();
+#else
+#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)      \
+  if (SE._is_local) {                                   \
+    auto in_t = in[SE._offset+s];                       \
+    if (SE._permute) {                                  \
+      spProj(tmp, in_t);                                \
+      permute(chi, tmp, ptype);                         \
+    } else {                                            \
+      spProj(chi, in_t);                                \
+    }                                                   \
+  } else {                                              \
+    chi = buf[SE._offset+s];                            \
+  }                                                     \
+  synchronise();
+#endif
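+
+//////////////////////////////////////////////////////////////
+// Dslash kernels: one {site,s} worth of work per invocation.
+// On the device each thread handles a single SIMD lane
+// (get_my_lanes()==1); on the host one thread loops over all
+// Nsimd lanes. Each stencil leg spin-projects the neighbour,
+// multiplies by the gauge link, and accumulates the spin
+// reconstruction into "result".
+//////////////////////////////////////////////////////////////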
+
+template <class Impl>
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, SiteDoubledGaugeField &U,
+                                                            SiteHalfSpinor *buf, int Ls, int s,
+                                                            int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typename SiteHalfSpinor::scalar_object chi;
+  typename SiteHalfSpinor::scalar_object Uchi;
+  typename SiteSpinor::scalar_object result;
+  typedef typename SiteSpinor::scalar_type scalar_type;
+  typedef typename SiteSpinor::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+  uint64_t lane_offset= get_my_lane_offset(Nsimd);
+  uint64_t lanes      = get_my_lanes(Nsimd);
+
+  StencilEntry *SE_mem;
+  StencilEntry SE;
+
+  int ptype;
+  uint64_t ssF = Ls * sU;
+  uint64_t sF  = ssF + s;
+#ifndef __CUDA_ARCH__
+  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
+#else
+  int lane = lane_offset; {
+#endif
+    // Xp leg initialises "result"; the remaining legs accumulate into it
+    SE_mem = st.GetEntry(ptype, Xp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);   spReconXp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Yp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);   accumReconYp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Zp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);   accumReconZp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Tp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);   accumReconTp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Xm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);   accumReconXm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Ym, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);   accumReconYm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Zm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);   accumReconZm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Tm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);   accumReconTm(result, Uchi);
+
+    insertLane (lane,out[sF],result);
+  }
+}
+
+template <class Impl>
+accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
+                                                         SiteHalfSpinor *buf, int Ls, int s,
+                                                         int sU, const FermionFieldView &in, FermionFieldView &out)
+{
+  typename SiteHalfSpinor::scalar_object chi;
+  typename SiteHalfSpinor::scalar_object Uchi;
+  typename SiteSpinor::scalar_object result;
+  typedef typename SiteSpinor::scalar_type scalar_type;
+  typedef typename SiteSpinor::vector_type vector_type;
+  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);
+
+  uint64_t lane_offset= get_my_lane_offset(Nsimd);
+  uint64_t lanes      = get_my_lanes(Nsimd);
+
+  StencilEntry *SE_mem;
+  StencilEntry SE;
+  int ptype;
+  // Forces some degree of coalesce on the table look ups
+  // Could also use wide load instructions on the data structure
+  uint64_t ssF = Ls * sU;
+  uint64_t sF  = ssF + s;
+
+#ifndef __CUDA_ARCH__
+  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
+#else
+  int lane = lane_offset; {
+#endif
+    // Non-dag kernel: the opposite-sign projector accompanies each direction
+    SE_mem = st.GetEntry(ptype, Xp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);   spReconXm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Yp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);   accumReconYm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Zp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);   accumReconZm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Tp, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);   accumReconTm(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Xm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);   accumReconXp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Ym, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);   accumReconYp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Zm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);   accumReconZp(result, Uchi);
+
+    SE_mem = st.GetEntry(ptype, Tm, sF); get_stencil(SE_mem,SE);
+    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
+    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);   accumReconTp(result, Uchi);
+
+    insertLane (lane,out[sF],result);
+  }
+}
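+
+//////////////////////////////////////////////////////////////
+// No Gpu implementation of the two-flavour G-parity kernels
+// yet; specialise them to an assert so a mistaken attempt to
+// run the Gpu path on a G-parity action fails loudly.
+//////////////////////////////////////////////////////////////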
+
+#define GPU_EMPTY(A)                                                    \
+template <>                                                             \
+accelerator_inline void                                                 \
+WilsonKernels<A>::GpuDhopSite(StencilView &st,                          \
+                              SiteDoubledGaugeField &U,                 \
+                              SiteHalfSpinor *buf, int Ls, int sF,      \
+                              int sU,                                   \
+                              const FermionFieldView &in,               \
+                              FermionFieldView &out) { assert(0);};     \
+template <>                                                             \
+accelerator_inline void                                                 \
+WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,                       \
+                                 SiteDoubledGaugeField &U,              \
+                                 SiteHalfSpinor *buf, int Ls,int sF,    \
+                                 int sU,                                \
+                                 const FermionFieldView &in,            \
+                                 FermionFieldView &out) { assert(0);};
+
+GPU_EMPTY(GparityWilsonImplF);
+GPU_EMPTY(GparityWilsonImplFH);
+GPU_EMPTY(GparityWilsonImplD);
+GPU_EMPTY(GparityWilsonImplDF);
+
+template <class Impl>
+void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+                                     int Ls, int Nsite, const FermionField &in, FermionField &out,
+                                     int interior,int exterior)
+{
+  auto U_v   =   U.View();
+  auto in_v  =  in.View();
+  auto out_v = out.View();
+  auto st_v  =  st.View();
+  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
+    const uint64_t nsimd = Simd::Nsimd();
+    const uint64_t    NN = Nsite*Ls*nsimd;
+    accelerator_loopN( sss, NN, {
+      uint64_t cur  = sss;
+      //      uint64_t lane = cur % nsimd;
+      cur = cur / nsimd;
+      uint64_t   s  = cur%Ls;
+      //      uint64_t   sF = cur;
+      cur = cur / Ls;
+      uint64_t   sU = cur;
+      WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
+    });
+  } else {
+    /*
+    accelerator_loop( ss, U_v, {
+      int sU = ss;
+      int sF = Ls * sU;
+      WilsonKernels<Impl>::DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
+    });
+    */
+  }
+}
+
+template <class Impl>
+void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
+                                        int Ls, int Nsite, const FermionField &in, FermionField &out,
+                                        int interior,int exterior)
+{
+  auto U_v   =   U.View();
+  auto in_v  =  in.View();
+  auto out_v = out.View();
+  auto st_v  =  st.View();
+
+  if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {
+    const uint64_t nsimd = Simd::Nsimd();
+    const uint64_t    NN = Nsite*Ls*nsimd;
+    accelerator_loopN( sss, NN, {
+      uint64_t cur  = sss;
+      //      uint64_t lane = cur % nsimd;
+      cur = cur / nsimd;
+      uint64_t   s  = cur%Ls;
+      //      uint64_t   sF = cur;
+      cur = cur / Ls;
+      uint64_t   sU = cur;
+      WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
+    });
+  } else {
+    //    accelerator_loop( ss, U_v, {
+    //      int sU = ss;
+    //      int sF = Ls * sU;
+    //      WilsonKernels<Impl>::DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
+    //    });
+  }
+}
+
+NAMESPACE_END(Grid);
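
Note on the flat index used in DhopKernel/DhopDagKernel above: accelerator_loopN
runs over NN = Nsite*Ls*nsimd with the SIMD lane varying fastest, then the fifth
dimension s, then the 4d site sU, i.e. sss = lane + nsimd*(s + Ls*sU).
Consecutive device threads therefore work on consecutive lanes of the same
site, which is what makes the spinor and stencil-table loads coalesce. A
minimal standalone sketch of that decomposition (SiteIndex/decode are
illustrative names, not Grid API):

    #include <cstdint>
    #include <cassert>

    // Decode a flat {lane,s,sU} index in the order DhopKernel uses:
    // lane varies fastest, then s in [0,Ls), then the 4d site sU.
    struct SiteIndex { uint64_t lane, s, sU; };

    inline SiteIndex decode(uint64_t sss, uint64_t nsimd, uint64_t Ls) {
      SiteIndex idx;
      idx.lane = sss % nsimd;  sss /= nsimd;
      idx.s    = sss % Ls;     sss /= Ls;
      idx.sU   = sss;          // whatever remains is the 4d site index
      return idx;
    }

    int main() {
      const uint64_t nsimd = 8, Ls = 16;
      SiteIndex i = decode(3 + nsimd*(5 + Ls*7), nsimd, Ls);
      assert(i.lane == 3 && i.s == 5 && i.sU == 7);
      return 0;
    }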