Grid/lib/qcd/action/fermion/WilsonKernelsGpu.cc

/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc

Copyright (C) 2018

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>

NAMESPACE_BEGIN(Grid);

//////////////////////////////////////////////////////////////
// Gpu implementation; thread loop is implicit ; move to header
//////////////////////////////////////////////////////////////
// Barrier across the threads of a block when compiled for the device
// (__CUDA_ARCH__ defined); compiles to an empty function otherwise.
accelerator_inline void synchronise(void)
{
#if defined(__CUDA_ARCH__)
  __syncthreads();
#endif
}
// Number of SIMD lanes the caller must process itself:
// a single lane in a device compilation pass, all Nsimd lanes otherwise.
accelerator_inline int get_my_lanes(int Nsimd)
{
#ifndef __CUDA_ARCH__
  const int lane_count = Nsimd;
#else
  const int lane_count = 1;
#endif
  return lane_count;
}
// First SIMD lane handled by the caller: threadIdx.x modulo Nsimd in a device
// compilation pass, lane 0 otherwise.
// NOTE(review): presumably relies on the launch making blockDim.x a multiple
// of Nsimd so that the lane matches the flattened loop index - confirm.
accelerator_inline int get_my_lane_offset(int Nsimd)
{
  int first_lane = 0;
#ifdef __CUDA_ARCH__
  first_lane = threadIdx.x % Nsimd;
#endif
  return first_lane;
}

//////////////////////////////////////////////////////////////
// GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)
//
// Loads the half spinor for one stencil leg into `chi`.  The expansion is
// unhygienic: it expects SE, ptype, s, in, buf and chi to be in scope at the
// point of use, plus lane and Nsimd (GPU_VEC variant) or tmp (other variant).
// `Dir` is accepted but not used by either expansion.
//
// GPU_VEC variant (one lane at a time):
//   - local entry : extracts one lane of in[SE->_offset+s] and applies spProj;
//                   when SE->_permute is set the lane read is lane ^ mask with
//                   mask = Nsimd >> (ptype+1)
//   - non-local   : extracts `lane` from buf[SE->_offset+s]
//   - synchronise() is called both before and after the load
//
// Non GPU_VEC variant (whole SIMD vector):
//   - local entry : spProj of in[SE->_offset+s], followed by
//                   permute(chi,tmp,ptype) when SE->_permute is set
//   - non-local   : copies buf[SE->_offset+s]
//   - synchronise() is called after the load only
//////////////////////////////////////////////////////////////
#ifdef GPU_VEC
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  synchronise();							\
  if (SE->_is_local) {							\
    int mask = Nsimd >> (ptype + 1);					\
    int plane= SE->_permute ? (lane ^ mask) : lane;			\
    auto in_l = extractLane(plane,in[SE->_offset+s]);			\
    spProj(chi,in_l);							\
  } else {								\
    chi  = extractLane(lane,buf[SE->_offset+s]);			\
  }									\
  synchronise();
#else 
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  if (SE->_is_local) {							\
    auto in_t = in[SE->_offset+s];					\
    if (SE->_permute) {							\
      spProj(tmp, in_t);						\
      permute(chi, tmp, ptype);						\
    } else {								\
      spProj(chi, in_t);						\
    }									\
  } else {								\
    chi  = buf[SE->_offset+s];						\
  }									\
  synchronise();
#endif

//////////////////////////////////////////////////////////////
// GpuDhopSiteDag : the "Dag" variant of the per-site Gpu hopping kernel.
//
//  st     : stencil view; st.GetEntry(ptype,dir,ssF) yields the entry for each leg
//  U      : doubled gauge field view; U[sU] is handed to Impl::multLinkGpu
//  buf    : half spinor buffer read by the leg macro for non-local entries
//  Ls, s  : combined with sU into sF = Ls*sU + s, the index written in `out`
//  sU     : gauge field site index
//  in     : input fermion view, read by the leg macro at SE->_offset + s
//  out    : output fermion view, written at out[sF]
//
// The eight legs Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm are processed in that order; each one
// loads chi through GPU_COALESCED_STENCIL_LEG_PROJ, multiplies by the link and
// reconstructs / accumulates into `result`.  Here the p legs use the spProj*p
// projectors and the m legs the spProj*m projectors (GpuDhopSite uses the
// opposite pairing).
//////////////////////////////////////////////////////////////
template <class Impl>
accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
						     SiteHalfSpinor *buf, int Ls, int s,
						     int sU, const FermionFieldView &in, FermionFieldView &out)
{
  // GPU_VEC: temporaries hold a single lane (scalar objects).
  // Otherwise: temporaries are full SIMD objects, and `tmp` is scratch for the leg macro.
#ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
#else 
  SiteHalfSpinor chi;
  SiteHalfSpinor Uchi;
  SiteHalfSpinor tmp;
  SiteSpinor   result;
#endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  // Number of scalars packed in one SIMD vector
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  // Range of lanes handled by this call (see get_my_lane_offset / get_my_lanes)
  uint64_t lane_offset= get_my_lane_offset(Nsimd);
  uint64_t lanes      = get_my_lanes(Nsimd);

  StencilEntry *SE;
  int ptype;
  // The stencil is looked up at ssF; the leg macro adds s to SE->_offset
  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;
  // Host pass: explicit loop over the lanes.  Device pass: a single lane, no loop.
#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else
  int lane = lane_offset; {
#endif
    // First leg initialises `result` (spRecon); the remaining legs accumulate (accumRecon)
    SE = st.GetEntry(ptype, Xp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp); 
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXp(result, Uchi);

    SE = st.GetEntry(ptype, Yp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYp(result, Uchi);
      
    SE = st.GetEntry(ptype, Zp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZp(result, Uchi);

    SE = st.GetEntry(ptype, Tp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTp(result, Uchi);

    SE = st.GetEntry(ptype, Xm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXm(result, Uchi);

    SE = st.GetEntry(ptype, Ym, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYm(result, Uchi);


    SE = st.GetEntry(ptype, Zm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZm(result, Uchi);

    SE = st.GetEntry(ptype, Tm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm); 
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTm(result, Uchi);

    synchronise();
    // Store: one lane of out[sF] under GPU_VEC, the whole vector otherwise
#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
#else
  vstream(out[sF], result);
#endif
  }
}

//////////////////////////////////////////////////////////////
// GpuDhopSite : per-site Gpu hopping kernel.
//
//  st     : stencil view; st.GetEntry(ptype,dir,ssF) yields the entry for each leg
//  U      : the doubled gauge field object of this site, handed to Impl::multLinkGpu
//  buf    : half spinor buffer read by the leg macro for non-local entries
//  Ls, s  : combined with sU into sF = Ls*sU + s, the index written in `out`
//  sU     : gauge field site index (used here only to form ssF)
//  in     : input fermion view, read by the leg macro at SE->_offset + s
//  out    : output fermion view, written at out[sF]
//
// The eight legs Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm are processed in that order; each one
// loads chi through GPU_COALESCED_STENCIL_LEG_PROJ, multiplies by the link and
// reconstructs / accumulates into `result`.  Here the p legs use the spProj*m
// projectors and the m legs the spProj*p projectors (GpuDhopSiteDag uses the
// opposite pairing).
//////////////////////////////////////////////////////////////
template <class Impl>
accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,
						  SiteHalfSpinor *buf,  int Ls, int s,
						  int sU, const FermionFieldView &in, FermionFieldView &out) 
{
  // GPU_VEC: temporaries hold a single lane (scalar objects).
  // Otherwise: temporaries are full SIMD objects, and `tmp` is scratch for the leg macro.
#ifdef GPU_VEC
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
#else 
  SiteHalfSpinor chi;
  SiteHalfSpinor Uchi;
  SiteHalfSpinor tmp;
  SiteSpinor   result;
#endif
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;
  // Number of scalars packed in one SIMD vector
  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  // Range of lanes handled by this call (see get_my_lane_offset / get_my_lanes)
  uint64_t lane_offset= get_my_lane_offset(Nsimd);
  uint64_t lanes      = get_my_lanes(Nsimd);

  StencilEntry *SE;
  int ptype;
  // Forces some degree of coalesce on the table look ups
  // Could also use wide load instructions on the data structure
  // The stencil is looked up at ssF; the leg macro adds s to SE->_offset
  uint64_t ssF = Ls * sU;
  uint64_t sF  = ssF + s;

  // Host pass: explicit loop over the lanes.  Device pass: a single lane, no loop.
#ifndef __CUDA_ARCH__
  for(int lane = lane_offset;lane<lane_offset+lanes;lane++){
#else
  int lane = lane_offset; {
#endif
    // First leg initialises `result` (spRecon); the remaining legs accumulate (accumRecon)
    SE = st.GetEntry(ptype, Xp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm); 
    Impl::multLinkGpu(lane,Uchi,U,chi,Xp);
    spReconXm(result, Uchi);

    SE = st.GetEntry(ptype, Yp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Yp);
    accumReconYm(result, Uchi);
      
    SE = st.GetEntry(ptype, Zp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zp);
    accumReconZm(result, Uchi);

    SE = st.GetEntry(ptype, Tp, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U,chi,Tp);
    accumReconTm(result, Uchi);

    SE = st.GetEntry(ptype, Xm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Xm);
    accumReconXp(result, Uchi);

    SE = st.GetEntry(ptype, Ym, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Ym);
    accumReconYp(result, Uchi);

    SE = st.GetEntry(ptype, Zm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U,chi,Zm);
    accumReconZp(result, Uchi);

    SE = st.GetEntry(ptype, Tm, ssF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp); 
    Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
    accumReconTp(result, Uchi);

    synchronise();
    // Store: one lane of out[sF] under GPU_VEC, the whole vector otherwise
#ifdef GPU_VEC
    insertLane (lane,out[sF],result);
#else
  vstream(out[sF], result);
#endif
  }

};

//////////////////////////////////////////////////////////////
// Template specialise Gparity to empty for now.
//
// GPU_EMPTY(A) emits explicit specialisations of GpuDhopSite and
// GpuDhopSiteDag for implementation A whose bodies only assert(0).
// The parameter lists mirror the primary templates (the fifth integer is `s`).
// Note that assert(0) expands to nothing when NDEBUG is defined, in which
// case these stubs silently do no work.
//////////////////////////////////////////////////////////////
#define GPU_EMPTY(A)							\
  template <>								\
  accelerator_inline void						\
  WilsonKernels<A>::GpuDhopSite(StencilView &st,			\
				SiteDoubledGaugeField &U,		\
				SiteHalfSpinor *buf, int Ls, int s,	\
				int sU,					\
				const FermionFieldView &in,		\
				FermionFieldView &out) { assert(0); }	\
  template <>								\
  accelerator_inline void						\
  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,			\
				   DoubledGaugeFieldView &U,		\
				   SiteHalfSpinor *buf, int Ls, int s,	\
				   int sU,				\
				   const FermionFieldView &in,		\
				   FermionFieldView &out) { assert(0); }

GPU_EMPTY(GparityWilsonImplF);
GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD);
GPU_EMPTY(GparityWilsonImplDF);

//////////////////////////////////////////////////////////////
// Dhop : apply the hopping term to every site of `in`, writing `out`.
//
//  Opt               : kernel selector; WilsonKernelsStatic::OptGpu picks the Gpu site kernel
//  st, U             : stencil and doubled gauge field; views of both are taken here
//  buf               : half spinor buffer handed to the Gpu site kernel
//  Ls, Nsite         : the Gpu loop runs Nsite*Ls*Nsimd iterations
//  in, out           : input and output fermion fields
//  interior,exterior : the Gpu path is taken only when both are non-zero
//
// Every other combination falls back to DhopSite inside a loop over U_v.
// NOTE(review): the fallback passes st.CommBuf() rather than `buf` and does
// not forward interior/exterior to DhopSite - confirm this is intended.
//////////////////////////////////////////////////////////////
template <class Impl>
void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
			       int Ls, int Nsite, const FermionField &in, FermionField &out,
			       int interior,int exterior) 
{
    auto U_v   = U.View();
    auto in_v  = in.View();
    auto out_v = out.View();
    auto st_v  = st.View();
    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
      const uint64_t nsimd = Simd::Nsimd();
      // Leading with the 64 bit factor keeps the whole product in 64 bit arithmetic
      const uint64_t    NN = nsimd*Nsite*Ls;
      accelerator_loopN( sss, NN, {
	  // Flattened index is [sU][s][lane]; the lane is recovered inside the kernel
	  uint64_t cur  = sss;
	  cur = cur / nsimd;
	  uint64_t   s  = cur%Ls;
	  uint64_t   sU = cur/Ls;
	  WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);
      });
    } else { 
      accelerator_loop( ss, U_v, {
	int sU = ss;
        int sF = Ls * sU;
        DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
      });
    }
  }
  //////////////////////////////////////////////////////////////
  // DhopDag : apply the Dag hopping term to every site of `in`, writing `out`.
  //
  //  Opt               : kernel selector; WilsonKernelsStatic::OptGpu picks the Gpu site kernel
  //  st, U             : stencil and doubled gauge field; views of both are taken here
  //  buf               : half spinor buffer handed to the Gpu site kernel
  //  Ls, Nsite         : the Gpu loop runs Nsite*Ls*Nsimd iterations
  //  in, out           : input and output fermion fields
  //  interior,exterior : the Gpu path is taken only when both are non-zero
  //
  // Every other combination falls back to DhopSiteDag inside a loop over U_v.
  // The fallback passes the stencil view st_v, as Dhop does for DhopSite.
  // NOTE(review): the fallback passes st.CommBuf() rather than `buf` and does
  // not forward interior/exterior to DhopSiteDag - confirm this is intended.
  //////////////////////////////////////////////////////////////
  template <class Impl>
  void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
				    int Ls, int Nsite, const FermionField &in, FermionField &out,
				    int interior,int exterior) 
  {
    auto U_v   = U.View();
    auto in_v  = in.View();
    auto out_v = out.View();
    auto st_v  = st.View();

    if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) { 
      const uint64_t nsimd = Simd::Nsimd();
      // Leading with the 64 bit factor keeps the whole product in 64 bit arithmetic
      const uint64_t    NN = nsimd*Nsite*Ls;
      accelerator_loopN( sss, NN, {
	  // Flattened index is [sU][s][lane]; the lane is recovered inside the kernel
	  uint64_t cur  = sss;
	  cur = cur / nsimd;
	  uint64_t   s  = cur%Ls;
	  uint64_t   sU = cur/Ls;
	  WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);
      });
    } else { 
      accelerator_loop( ss, U_v, {
	int sU = ss;
        int sF = Ls * sU;
        DhopSiteDag(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);
      });
    }
  }


/*
GPU_EMPTY(DomainWallVec5dImplF);
GPU_EMPTY(DomainWallVec5dImplFH);
GPU_EMPTY(DomainWallVec5dImplD);
GPU_EMPTY(DomainWallVec5dImplDF);
GPU_EMPTY(ZDomainWallVec5dImplF);
GPU_EMPTY(ZDomainWallVec5dImplFH);
GPU_EMPTY(ZDomainWallVec5dImplD);
GPU_EMPTY(ZDomainWallVec5dImplDF);
*/

// Invoke the project's instantiation macros on WilsonKernels.
// NOTE(review): presumably these expand to explicit template instantiations
// for the fundamental, adjoint and two-index implementation lists - the
// macro definitions are not visible in this file, confirm in the headers.
FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);

NAMESPACE_END(Grid);
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc`

			`Copyright (C) 2018`

			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution`
			`directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
			`#include <Grid/qcd/action/fermion/FermionCore.h>`

			`NAMESPACE_BEGIN(Grid);`

			`//////////////////////////////////////////////////////////////`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`// Gpu implementation; thread loop is implicit ; move to header`
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`//////////////////////////////////////////////////////////////`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`accelerator_inline void synchronise(void)`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`{`
			`#ifdef __CUDA_ARCH__`
			`__syncthreads();`
			`#endif`
			`return;`
			`}`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`accelerator_inline int get_my_lanes(int Nsimd)`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`{`
			`#ifdef __CUDA_ARCH__`
			`return 1;`
			`#else`
			`return Nsimd;`
			`#endif`
			`}`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`accelerator_inline int get_my_lane_offset(int Nsimd)`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`{`
			`#ifdef __CUDA_ARCH__`
			`return ( (threadIdx.x) % Nsimd);`
			`#else`
			`return 0;`
			`#endif`
			`}`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#ifdef GPU_VEC`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`synchronise(); \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`if (SE->_is_local) { \`
			`int mask = Nsimd >> (ptype + 1); \`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`int plane= SE->_permute ? (lane ^ mask) : lane; \`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`auto in_l = extractLane(plane,in[SE->_offset+s]); \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`spProj(chi,in_l); \`
			`} else { \`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`chi = extractLane(lane,buf[SE->_offset+s]); \`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`} \`
			`synchronise();`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#else`
			`#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \`
			`if (SE->_is_local) { \`
			`auto in_t = in[SE->_offset+s]; \`
			`if (SE->_permute) { \`
			`spProj(tmp, in_t); \`
			`permute(chi, tmp, ptype); \`
			`} else { \`
			`spProj(chi, in_t); \`
			`} \`
			`} else { \`
			`chi = buf[SE->_offset+s]; \`
			`} \`
			`synchronise();`
			`#endif`
GPU dslash updates 2018-06-27 22:32:21 +01:00
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`template <class Impl>`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,`
			`SiteHalfSpinor *buf, int Ls, int s,`
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`int sU, const FermionFieldView &in, FermionFieldView &out)`
			`{`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#ifdef GPU_VEC`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`typename SiteHalfSpinor::scalar_object chi;`
			`typename SiteHalfSpinor::scalar_object Uchi;`
			`typename SiteSpinor::scalar_object result;`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#else`
			`SiteHalfSpinor chi;`
			`SiteHalfSpinor Uchi;`
			`SiteHalfSpinor tmp;`
			`SiteSpinor result;`
			`#endif`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`typedef typename SiteSpinor::scalar_type scalar_type;`
			`typedef typename SiteSpinor::vector_type vector_type;`
			`constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);`

			`uint64_t lane_offset= get_my_lane_offset(Nsimd);`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`uint64_t lanes = get_my_lanes(Nsimd);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`StencilEntry *SE;`
			`int ptype;`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`uint64_t ssF = Ls * sU;`
			`uint64_t sF = ssF + s;`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`#ifndef __CUDA_ARCH__`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`for(int lane = lane_offset;lane<lane_offset+lanes;lane++){`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`#else`
			`int lane = lane_offset; {`
			`#endif`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Xp, ssF);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);`
			`spReconXp(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Yp, ssF);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);`
			`accumReconYp(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Zp, ssF);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);`
			`accumReconZp(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Tp, ssF);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);`
			`accumReconTp(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Xm, ssF);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);`
			`accumReconXm(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Ym, ssF);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);`
			`accumReconYm(result, Uchi);`


Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Zm, ssF);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);`
			`accumReconZm(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Tm, ssF);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);`
			`accumReconTm(result, Uchi);`

			`synchronise();`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#ifdef GPU_VEC`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`insertLane (lane,out[sF],result);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#else`
			`vstream(out[sF], result);`
			`#endif`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`}`
			`}`
Start of GPU kernels 2018-05-15 00:41:17 +01:00
			`template <class Impl>`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDoubledGaugeField &U,`
			`SiteHalfSpinor *buf, int Ls, int s,`
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`int sU, const FermionFieldView &in, FermionFieldView &out)`
			`{`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#ifdef GPU_VEC`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`typename SiteHalfSpinor::scalar_object chi;`
			`typename SiteHalfSpinor::scalar_object Uchi;`
			`typename SiteSpinor::scalar_object result;`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#else`
			`SiteHalfSpinor chi;`
			`SiteHalfSpinor Uchi;`
			`SiteHalfSpinor tmp;`
			`SiteSpinor result;`
			`#endif`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`typedef typename SiteSpinor::scalar_type scalar_type;`
			`typedef typename SiteSpinor::vector_type vector_type;`
			`constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);`

			`uint64_t lane_offset= get_my_lane_offset(Nsimd);`
			`uint64_t lanes = get_my_lanes(Nsimd);`

Start of GPU kernels 2018-05-15 00:41:17 +01:00			`StencilEntry *SE;`
			`int ptype;`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`// Forces some degree of coalesce on the table look ups`
			`// Could also use wide load instructions on the data structure`
			`uint64_t ssF = Ls * sU;`
			`uint64_t sF = ssF + s;`
GPU dslash updates 2018-06-27 22:32:21 +01:00
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`#ifndef __CUDA_ARCH__`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`for(int lane = lane_offset;lane<lane_offset+lanes;lane++){`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`#else`
			`int lane = lane_offset; {`
			`#endif`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Xp, ssF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`Impl::multLinkGpu(lane,Uchi,U,chi,Xp);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`spReconXm(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Yp, ssF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`Impl::multLinkGpu(lane,Uchi,U,chi,Yp);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconYm(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Zp, ssF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`Impl::multLinkGpu(lane,Uchi,U,chi,Zp);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconZm(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Tp, ssF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`Impl::multLinkGpu(lane,Uchi,U,chi,Tp);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconTm(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Xm, ssF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`Impl::multLinkGpu(lane,Uchi,U,chi,Xm);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconXp(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Ym, ssF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`Impl::multLinkGpu(lane,Uchi,U,chi,Ym);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconYp(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Zm, ssF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`Impl::multLinkGpu(lane,Uchi,U,chi,Zm);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconZp(result, Uchi);`

Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SE = st.GetEntry(ptype, Tm, ssF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`Impl::multLinkGpu(lane,Uchi,U,chi,Tm);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconTp(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`synchronise();`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#ifdef GPU_VEC`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`insertLane (lane,out[sF],result);`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`#else`
			`vstream(out[sF], result);`
			`#endif`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`}`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`};`

Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`// Template specialise Gparity to empty for now`
			`#define GPU_EMPTY(A) \`
			`template <> \`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`accelerator_inline void \`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`WilsonKernels<A>::GpuDhopSite(StencilView &st, \`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`SiteDoubledGaugeField &U, \`
			`SiteHalfSpinor *buf, int Ls, int sF, \`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`int sU, \`
			`const FermionFieldView &in, \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`FermionFieldView &out) { assert(0);}; \`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`template <> \`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`accelerator_inline void \`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \`
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`DoubledGaugeFieldView &U, \`
			`SiteHalfSpinor *buf, int Ls,int sF, \`
			`int sU, \`
			`const FermionFieldView &in, \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`FermionFieldView &out) { assert(0);};`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
			`GPU_EMPTY(GparityWilsonImplF);`
			`GPU_EMPTY(GparityWilsonImplFH);`
			`GPU_EMPTY(GparityWilsonImplD);`
			`GPU_EMPTY(GparityWilsonImplDF);`
GPU dslash updates 2018-06-27 22:32:21 +01:00
Finally starting to get decent performance on Volta 2018-07-13 17:06:18 +01:00			`template <class Impl>`
			`void WilsonKernels<Impl>::Dhop(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,`
			`int Ls, int Nsite, const FermionField &in, FermionField &out,`
			`int interior,int exterior)`
			`{`
			`auto U_v = U.View();`
			`auto in_v = in.View();`
			`auto out_v = out.View();`
			`auto st_v = st.View();`
			`if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {`
			`const uint64_t nsimd = Simd::Nsimd();`
			`const uint64_t NN = Nsite*Ls*nsimd;`
			`accelerator_loopN( sss, NN, {`
			`uint64_t cur = sss;`
			`// uint64_t lane = cur % nsimd;`
			`cur = cur / nsimd;`
			`uint64_t s = cur%Ls;`
			`uint64_t sF = cur; cur = cur / Ls;`
			`uint64_t sU = cur;`
			`WilsonKernels<Impl>::GpuDhopSite(st_v,U_v[sU],buf,Ls,s,sU,in_v,out_v);`
			`});`
			`} else {`
			`accelerator_loop( ss, U_v, {`
			`int sU = ss;`
			`int sF = Ls * sU;`
			`DhopSite(Opt,st_v,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);`
			`});`
			`}`
			`}`
			`template <class Impl>`
			`void WilsonKernels<Impl>::DhopDag(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,`
			`int Ls, int Nsite, const FermionField &in, FermionField &out,`
			`int interior,int exterior)`
			`{`
			`auto U_v = U.View();`
			`auto in_v = in.View();`
			`auto out_v = out.View();`
			`auto st_v = st.View();`

			`if ( (Opt == WilsonKernelsStatic::OptGpu) && interior && exterior ) {`
			`const uint64_t nsimd = Simd::Nsimd();`
			`const uint64_t NN = Nsite*Ls*nsimd;`
			`accelerator_loopN( sss, NN, {`
			`uint64_t cur = sss;`
			`// uint64_t lane = cur % nsimd;`
			`cur = cur / nsimd;`
			`uint64_t s = cur%Ls;`
			`uint64_t sF = cur; cur = cur / Ls;`
			`uint64_t sU = cur;`
			`WilsonKernels<Impl>::GpuDhopSiteDag(st_v,U_v,buf,Ls,s,sU,in_v,out_v);`
			`});`
			`} else {`
			`accelerator_loop( ss, U_v, {`
			`int sU = ss;`
			`int sF = Ls * sU;`
			`DhopSiteDag(Opt,st,U_v,st.CommBuf(),sF,sU,Ls,1,in_v,out_v);`
			`});`
			`}`
			`}`


GPU dslash updates 2018-06-27 22:32:21 +01:00			`/*`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_EMPTY(DomainWallVec5dImplF);`
			`GPU_EMPTY(DomainWallVec5dImplFH);`
			`GPU_EMPTY(DomainWallVec5dImplD);`
			`GPU_EMPTY(DomainWallVec5dImplDF);`
			`GPU_EMPTY(ZDomainWallVec5dImplF);`
			`GPU_EMPTY(ZDomainWallVec5dImplFH);`
			`GPU_EMPTY(ZDomainWallVec5dImplD);`
			`GPU_EMPTY(ZDomainWallVec5dImplDF);`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`*/`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`FermOpTemplateInstantiate(WilsonKernels);`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`AdjointFermOpTemplateInstantiate(WilsonKernels);`
			`TwoIndexFermOpTemplateInstantiate(WilsonKernels);`
Start of GPU kernels 2018-05-15 00:41:17 +01:00
			`NAMESPACE_END(Grid);`