// Grid/lib/qcd/action/fermion/WilsonKernelsGpu.cc

/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc

Copyright (C) 2018

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>

NAMESPACE_BEGIN(Grid);

//////////////////////////////////////////////////////////////
// GPU implementation: the thread loop is implicit (one SIMD lane per thread
// on the device). TODO: move these helpers to a header.
//////////////////////////////////////////////////////////////
// Barrier helper: issues __syncthreads() when compiled for the device
// (__CUDA_ARCH__ defined) and is an empty function otherwise.
accelerator_inline void synchronise(void)
{
#if defined(__CUDA_ARCH__)
  __syncthreads();
#endif
}
// Number of SIMD lanes the caller has to process itself:
//   host build   -> all Nsimd lanes,
//   device build -> exactly one lane.
accelerator_inline int get_my_lanes(int Nsimd)
{
#if !defined(__CUDA_ARCH__)
  return Nsimd;
#else
  return 1;
#endif
}
// First SIMD lane handled by the caller:
//   host build   -> lane 0,
//   device build -> threadIdx.x modulo Nsimd.
accelerator_inline int get_my_lane_offset(int Nsimd)
{
#if !defined(__CUDA_ARCH__)
  return 0;
#else
  const int my_lane = threadIdx.x % Nsimd;
  return my_lane;
#endif
}


//////////////////////////////////////////////////////////////
// Load the half spinor for one stencil leg into `chi` for the current lane.
//
// The macro reads these names from the enclosing scope:
//   SE, ptype, Nsimd, lane, in, buf   (inputs)
//   chi                               (output)
//
//  * SE->_is_local : one lane of in[SE->_offset] is extracted and projected
//                    with spProj into chi. When SE->_permute is set the lane
//                    read is `lane ^ (Nsimd >> (ptype + 1))` instead of `lane`.
//  * otherwise     : chi is lane `lane` of buf[SE->_offset]; no projection
//                    is applied here.
//
// synchronise() is called before and after the load, outside the if/else,
// so every thread executes both barriers whichever branch it takes.
//
// `Dir` is not used by the expansion; it is kept so call sites stay unchanged.
//
// The body is wrapped in do { ... } while(0) so that the expansion is a single
// statement: it then behaves correctly under an unbraced if/else, where the
// bare statement list would have left only the first barrier conditional.
// Call sites must supply the trailing ';'.
//////////////////////////////////////////////////////////////
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj)			\
  do {									\
    synchronise();							\
    if (SE->_is_local) {						\
      int mask = Nsimd >> (ptype + 1);					\
      int plane= SE->_permute ? (lane ^ mask) : lane;			\
      auto in_l = extractLane(plane,in[SE->_offset]);			\
      spProj(chi,in_l);							\
    } else {								\
      chi  = extractLane(lane,buf[SE->_offset]);			\
    }									\
    synchronise();							\
  } while(0)

//////////////////////////////////////////////////////////////
// Per-site hopping kernel, "Dag" variant.
// NOTE(review): "Dag" presumably denotes the daggered operator -- confirm.
//
// For each of the eight legs (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm) the kernel
//   1. fetches the stencil entry for fermion site sF,
//   2. loads the half spinor chi through GPU_COALESCED_STENCIL_LEG_PROJ,
//      using the projector with the SAME sign as the leg (Xp -> spProjXp, ...),
//   3. multiplies by the gauge link taken from U[sU] (Impl::multLinkGpu),
//   4. reconstructs into `result`: the first leg initialises it (spRecon*),
//      the remaining seven accumulate (accumRecon*).
// The finished lane is written to out[sF] with insertLane.
//
// Lane handling:
//   host build   : loops over all lanes reported by get_my_lanes(Nsimd),
//                  starting at get_my_lane_offset(Nsimd);
//   device build : processes the single lane get_my_lane_offset(Nsimd).
//
// On the device, the leg macro and the final synchronise() execute
// __syncthreads(); all threads of a block must therefore call this function
// together.
//
// Parameters:
//   st   stencil view queried with GetEntry(ptype, dir, sF)
//   U    gauge field view, indexed by sU
//   buf  half-spinor buffer read for non-local legs
//   sF   fermion site index (stencil lookup and output site)
//   sU   gauge site index
//   in   input fermion field view, read for local legs
//   out  output fermion field view, written at sF
//////////////////////////////////////////////////////////////
template <class Impl>
accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
                                                     SiteHalfSpinor *buf, int sF,
                                                     int sU, const FermionFieldView &in, FermionFieldView &out)
{
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;

  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  // The lane helpers return int and `lane` is an int, so keep the whole
  // lane arithmetic in int (no signed/unsigned comparison, no narrowing).
  const int lane_offset = get_my_lane_offset(Nsimd);

  StencilEntry *SE;
  int ptype;

#ifndef __CUDA_ARCH__
  // Host: visit every lane in turn; the bound is only needed on this path.
  const int lane_end = lane_offset + get_my_lanes(Nsimd);
  for(int lane = lane_offset;lane<lane_end;lane++){
#else
  // Device: this thread owns exactly one lane.
  int lane = lane_offset; {
#endif
    // First leg initialises result; all later legs accumulate into it.
    SE = st.GetEntry(ptype, Xp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXp(result, Uchi);

    SE = st.GetEntry(ptype, Yp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYp(result, Uchi);

    SE = st.GetEntry(ptype, Zp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZp(result, Uchi);

    SE = st.GetEntry(ptype, Tp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTp(result, Uchi);

    SE = st.GetEntry(ptype, Xm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXm(result, Uchi);

    SE = st.GetEntry(ptype, Ym, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYm(result, Uchi);

    SE = st.GetEntry(ptype, Zm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZm(result, Uchi);

    SE = st.GetEntry(ptype, Tm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTm(result, Uchi);

    // Barrier before the store of this lane into the output site.
    synchronise();
    insertLane (lane,out[sF],result);
  }
}

//////////////////////////////////////////////////////////////
// Per-site hopping kernel, non-"Dag" variant.
//
// For each of the eight legs (Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm) the kernel
//   1. fetches the stencil entry for fermion site sF,
//   2. loads the half spinor chi through GPU_COALESCED_STENCIL_LEG_PROJ,
//      using the projector with the OPPOSITE sign to the leg
//      (Xp -> spProjXm, Xm -> spProjXp, ...),
//   3. multiplies by the gauge link taken from U[sU] (Impl::multLinkGpu),
//   4. reconstructs into `result`: the first leg initialises it (spRecon*),
//      the remaining seven accumulate (accumRecon*).
// The finished lane is written to out[sF] with insertLane.
//
// Lane handling:
//   host build   : loops over all lanes reported by get_my_lanes(Nsimd),
//                  starting at get_my_lane_offset(Nsimd);
//   device build : processes the single lane get_my_lane_offset(Nsimd).
//
// On the device, the leg macro and the final synchronise() execute
// __syncthreads(); all threads of a block must therefore call this function
// together.
//
// Parameters:
//   st   stencil view queried with GetEntry(ptype, dir, sF)
//   U    gauge field view, indexed by sU
//   buf  half-spinor buffer read for non-local legs
//   sF   fermion site index (stencil lookup and output site)
//   sU   gauge site index
//   in   input fermion field view, read for local legs
//   out  output fermion field view, written at sF
//////////////////////////////////////////////////////////////
template <class Impl>
accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,
                                                  SiteHalfSpinor *buf, int sF,
                                                  int sU, const FermionFieldView &in, FermionFieldView &out)
{
  typename SiteHalfSpinor::scalar_object chi;
  typename SiteHalfSpinor::scalar_object Uchi;
  typename SiteSpinor::scalar_object   result;
  typedef typename SiteSpinor::scalar_type scalar_type;
  typedef typename SiteSpinor::vector_type vector_type;

  constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);

  // The lane helpers return int and `lane` is an int, so keep the whole
  // lane arithmetic in int (no signed/unsigned comparison, no narrowing).
  const int lane_offset = get_my_lane_offset(Nsimd);

  StencilEntry *SE;
  int ptype;

#ifndef __CUDA_ARCH__
  // Host: visit every lane in turn; the bound is only needed on this path.
  const int lane_end = lane_offset + get_my_lanes(Nsimd);
  for(int lane = lane_offset;lane<lane_end;lane++){
#else
  // Device: this thread owns exactly one lane.
  int lane = lane_offset; {
#endif
    // First leg initialises result; all later legs accumulate into it.
    SE = st.GetEntry(ptype, Xp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);
    spReconXm(result, Uchi);

    SE = st.GetEntry(ptype, Yp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);
    accumReconYm(result, Uchi);

    SE = st.GetEntry(ptype, Zp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);
    accumReconZm(result, Uchi);

    SE = st.GetEntry(ptype, Tp, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);
    accumReconTm(result, Uchi);

    SE = st.GetEntry(ptype, Xm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);
    accumReconXp(result, Uchi);

    SE = st.GetEntry(ptype, Ym, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);
    accumReconYp(result, Uchi);

    SE = st.GetEntry(ptype, Zm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);
    accumReconZp(result, Uchi);

    SE = st.GetEntry(ptype, Tm, sF);
    GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);
    Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
    accumReconTp(result, Uchi);

    // Barrier before the store of this lane into the output site.
    synchronise();
    insertLane (lane,out[sF],result);
  }
}

// Template specialise Gparity to empty for now.
//
// GPU_EMPTY(A) defines explicit specialisations of
// WilsonKernels<A>::GpuDhopSite and WilsonKernels<A>::GpuDhopSiteDag whose
// bodies consist solely of assert(0). Invoke it as `GPU_EMPTY(Impl);`.
#define GPU_EMPTY(A)                                            \
  template <>                                                   \
  accelerator void                                              \
  WilsonKernels<A>::GpuDhopSite(StencilView &st,                \
                                DoubledGaugeFieldView &U,       \
                                SiteHalfSpinor *buf,            \
                                int sF,                         \
                                int sU,                         \
                                const FermionFieldView &in,     \
                                FermionFieldView &out)          \
  {                                                             \
    assert(0);                                                  \
  }                                                             \
  template <>                                                   \
  accelerator void                                              \
  WilsonKernels<A>::GpuDhopSiteDag(StencilView &st,             \
                                   DoubledGaugeFieldView &U,    \
                                   SiteHalfSpinor *buf,         \
                                   int sF,                      \
                                   int sU,                      \
                                   const FermionFieldView &in,  \
                                   FermionFieldView &out)       \
  {                                                             \
    assert(0);                                                  \
  }

// Emit the assert(0) stub specialisations of GpuDhopSite / GpuDhopSiteDag
// for the four Gparity Wilson implementations.
// NOTE(review): the F / FH / D / DF suffixes presumably select precision
// variants of the implementation -- confirm against the Impl typedefs.
GPU_EMPTY(GparityWilsonImplF);
GPU_EMPTY(GparityWilsonImplFH);
GPU_EMPTY(GparityWilsonImplD);
GPU_EMPTY(GparityWilsonImplDF);

/*
GPU_EMPTY(DomainWallVec5dImplF);
GPU_EMPTY(DomainWallVec5dImplFH);
GPU_EMPTY(DomainWallVec5dImplD);
GPU_EMPTY(DomainWallVec5dImplDF);
GPU_EMPTY(ZDomainWallVec5dImplF);
GPU_EMPTY(ZDomainWallVec5dImplFH);
GPU_EMPTY(ZDomainWallVec5dImplD);
GPU_EMPTY(ZDomainWallVec5dImplDF);
*/

// Instantiate WilsonKernels through the project's instantiation macros.
// NOTE(review): these macros are defined outside this file; presumably each
// expands to explicit instantiations of the class template for one family of
// Impl types (fundamental, adjoint, two-index) -- confirm at their definition.
FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);

NAMESPACE_END(Grid);
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`/*************************************************************************************`

			`Grid physics library, www.github.com/paboyle/Grid`

			`Source file: ./lib/qcd/action/fermion/WilsonKernelsGpu.cc`

			`Copyright (C) 2018`

			`Author: Peter Boyle <paboyle@ph.ed.ac.uk>`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License along`
			`with this program; if not, write to the Free Software Foundation, Inc.,`
			`51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.`

			`See the full license in the file "LICENSE" in the top level distribution`
			`directory`
			`*************************************************************************************/`
			`/* END LEGAL */`
			`#include <Grid/qcd/action/fermion/FermionCore.h>`

			`NAMESPACE_BEGIN(Grid);`

			`//////////////////////////////////////////////////////////////`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`// Gpu implementation; thread loop is implicit ; move to header`
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`//////////////////////////////////////////////////////////////`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`accelerator_inline void synchronise(void)`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`{`
			`#ifdef __CUDA_ARCH__`
			`__syncthreads();`
			`#endif`
			`return;`
			`}`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`accelerator_inline int get_my_lanes(int Nsimd)`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`{`
			`#ifdef __CUDA_ARCH__`
			`return 1;`
			`#else`
			`return Nsimd;`
			`#endif`
			`}`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`accelerator_inline int get_my_lane_offset(int Nsimd)`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`{`
			`#ifdef __CUDA_ARCH__`
			`return ( (threadIdx.x) % Nsimd);`
			`#else`
			`return 0;`
			`#endif`
			`}`


GPU dslash updates 2018-06-27 22:32:21 +01:00			`#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`synchronise(); \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`if (SE->_is_local) { \`
			`int mask = Nsimd >> (ptype + 1); \`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`int plane= SE->_permute ? (lane ^ mask) : lane; \`
			`auto in_l = extractLane(plane,in[SE->_offset]); \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`spProj(chi,in_l); \`
			`} else { \`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`chi = extractLane(lane,buf[SE->_offset]); \`
			`} \`
			`synchronise();`
GPU dslash updates 2018-06-27 22:32:21 +01:00
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`template <class Impl>`
			`accelerator void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`SiteHalfSpinor *buf, int sF,`
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`int sU, const FermionFieldView &in, FermionFieldView &out)`
			`{`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`typename SiteHalfSpinor::scalar_object chi;`
			`typename SiteHalfSpinor::scalar_object Uchi;`
			`typename SiteSpinor::scalar_object result;`
			`typedef typename SiteSpinor::scalar_type scalar_type;`
			`typedef typename SiteSpinor::vector_type vector_type;`
GPU dslash updates 2018-06-27 22:32:21 +01:00
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);`

			`uint64_t lane_offset= get_my_lane_offset(Nsimd);`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`uint64_t lanes = get_my_lanes(Nsimd);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`StencilEntry *SE;`
			`int ptype;`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`#ifndef __CUDA_ARCH__`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`for(int lane = lane_offset;lane<lane_offset+lanes;lane++){`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`#else`
			`int lane = lane_offset; {`
			`#endif`
			`SE = st.GetEntry(ptype, Xp, sF);`
			`GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXp);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);`
			`spReconXp(result, Uchi);`

			`SE = st.GetEntry(ptype, Yp, sF);`
			`GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYp);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);`
			`accumReconYp(result, Uchi);`

			`SE = st.GetEntry(ptype, Zp, sF);`
			`GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZp);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);`
			`accumReconZp(result, Uchi);`

			`SE = st.GetEntry(ptype, Tp, sF);`
			`GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTp);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);`
			`accumReconTp(result, Uchi);`

			`SE = st.GetEntry(ptype, Xm, sF);`
			`GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXm);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);`
			`accumReconXm(result, Uchi);`

			`SE = st.GetEntry(ptype, Ym, sF);`
			`GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYm);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);`
			`accumReconYm(result, Uchi);`


			`SE = st.GetEntry(ptype, Zm, sF);`
			`GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZm);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);`
			`accumReconZm(result, Uchi);`

			`SE = st.GetEntry(ptype, Tm, sF);`
			`GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTm);`
			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);`
			`accumReconTm(result, Uchi);`

			`synchronise();`
			`insertLane (lane,out[sF],result);`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`}`
			`}`
Start of GPU kernels 2018-05-15 00:41:17 +01:00
			`template <class Impl>`
			`accelerator void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, DoubledGaugeFieldView &U,`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`SiteHalfSpinor *buf, int sF,`
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`int sU, const FermionFieldView &in, FermionFieldView &out)`
			`{`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`typename SiteHalfSpinor::scalar_object chi;`
			`typename SiteHalfSpinor::scalar_object Uchi;`
			`typename SiteSpinor::scalar_object result;`
			`typedef typename SiteSpinor::scalar_type scalar_type;`
			`typedef typename SiteSpinor::vector_type vector_type;`

			`constexpr int Nsimd = sizeof(vector_type)/sizeof(scalar_type);`

			`uint64_t lane_offset= get_my_lane_offset(Nsimd);`
			`uint64_t lanes = get_my_lanes(Nsimd);`

Start of GPU kernels 2018-05-15 00:41:17 +01:00			`StencilEntry *SE;`
			`int ptype;`
GPU dslash updates 2018-06-27 22:32:21 +01:00
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`#ifndef __CUDA_ARCH__`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`for(int lane = lane_offset;lane<lane_offset+lanes;lane++){`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`#else`
			`int lane = lane_offset; {`
			`#endif`
			`SE = st.GetEntry(ptype, Xp, sF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Xp,spProjXm);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xp);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`spReconXm(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`SE = st.GetEntry(ptype, Yp, sF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Yp,spProjYm);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Yp);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconYm(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`SE = st.GetEntry(ptype, Zp, sF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Zp,spProjZm);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zp);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconZm(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`SE = st.GetEntry(ptype, Tp, sF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Tp,spProjTm);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tp);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconTm(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`SE = st.GetEntry(ptype, Xm, sF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Xm,spProjXp);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Xm);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconXp(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`SE = st.GetEntry(ptype, Ym, sF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Ym,spProjYp);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Ym);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconYp(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`SE = st.GetEntry(ptype, Zm, sF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Zm,spProjZp);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Zm);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconZp(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`SE = st.GetEntry(ptype, Tm, sF);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_COALESCED_STENCIL_LEG_PROJ(Tm,spProjTp);`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`accumReconTp(result, Uchi);`

Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00			`synchronise();`
			`insertLane (lane,out[sF],result);`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`}`
Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 12:10:25 +01:00
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`};`

Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`// Template specialise Gparity to empty for now`
			`#define GPU_EMPTY(A) \`
			`template <> \`
			`accelerator void \`
			`WilsonKernels<A>::GpuDhopSite(StencilView &st, \`
			`DoubledGaugeFieldView &U, \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`SiteHalfSpinor *buf, int sF, \`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`int sU, \`
			`const FermionFieldView &in, \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`FermionFieldView &out) { assert(0);}; \`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`template <> \`
			`accelerator void \`
			`WilsonKernels<A>::GpuDhopSiteDag(StencilView &st, \`
			`DoubledGaugeFieldView &U, \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`SiteHalfSpinor *buf, int sF, \`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`int sU, \`
			`const FermionFieldView &in, \`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`FermionFieldView &out) { assert(0);};`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
			`GPU_EMPTY(GparityWilsonImplF);`
			`GPU_EMPTY(GparityWilsonImplFH);`
			`GPU_EMPTY(GparityWilsonImplD);`
			`GPU_EMPTY(GparityWilsonImplDF);`
GPU dslash updates 2018-06-27 22:32:21 +01:00
			`/*`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00			`GPU_EMPTY(DomainWallVec5dImplF);`
			`GPU_EMPTY(DomainWallVec5dImplFH);`
			`GPU_EMPTY(DomainWallVec5dImplD);`
			`GPU_EMPTY(DomainWallVec5dImplDF);`
			`GPU_EMPTY(ZDomainWallVec5dImplF);`
			`GPU_EMPTY(ZDomainWallVec5dImplFH);`
			`GPU_EMPTY(ZDomainWallVec5dImplD);`
			`GPU_EMPTY(ZDomainWallVec5dImplDF);`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`*/`
Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
Start of GPU kernels 2018-05-15 00:41:17 +01:00			`FermOpTemplateInstantiate(WilsonKernels);`
GPU dslash updates 2018-06-27 22:32:21 +01:00			`AdjointFermOpTemplateInstantiate(WilsonKernels);`
			`TwoIndexFermOpTemplateInstantiate(WilsonKernels);`
Start of GPU kernels 2018-05-15 00:41:17 +01:00
			`NAMESPACE_END(Grid);`