mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-25 13:15:55 +01:00
Runs multiGPU with coalesced access on tesseract
This commit is contained in:
parent
37336c9e0c
commit
9fe68857a9
@ -68,11 +68,20 @@ public:
|
|||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Compress includes precision change if mpi data is not same */
|
/* Compress includes precision change if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
|
#if 0
|
||||||
accelerator_inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
|
accelerator_inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
projector::Proj(tmp,in,mu,dag);
|
projector::Proj(tmp,in,mu,dag);
|
||||||
vstream(buf[o],tmp);
|
vstream(buf[o],tmp);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
template<class _SiteHalfSpinor, class _SiteSpinor>
|
||||||
|
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
|
||||||
|
_SiteHalfSpinor tmp;
|
||||||
|
projector::Proj(tmp,in,mu,dag);
|
||||||
|
vstream(buf[o],tmp);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Exchange includes precision change if mpi data is not same */
|
/* Exchange includes precision change if mpi data is not same */
|
||||||
@ -148,8 +157,9 @@ public:
|
|||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Compress includes precision change if mpi data is not same */
|
/* Compress includes precision change if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
accelerator_inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
|
template<class _SiteHalfSpinor, class _SiteSpinor>
|
||||||
SiteHalfSpinor hsp;
|
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
|
||||||
|
_SiteHalfSpinor hsp;
|
||||||
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
|
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
|
||||||
projector::Proj(hsp,in,mu,dag);
|
projector::Proj(hsp,in,mu,dag);
|
||||||
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
|
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
|
||||||
@ -395,6 +405,9 @@ public:
|
|||||||
this->face_table_computed=1;
|
this->face_table_computed=1;
|
||||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||||
this->halogtime+=usecond();
|
this->halogtime+=usecond();
|
||||||
|
#ifdef GRID_NVCC
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -104,7 +104,6 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
#ifndef GRID_NVCC
|
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -122,7 +121,6 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
|||||||
GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -130,7 +128,6 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
#ifndef GRID_NVCC
|
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -148,7 +145,6 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
|||||||
GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Interior kernels
|
// Interior kernels
|
||||||
@ -158,7 +154,6 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
#ifndef GRID_NVCC
|
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -177,7 +172,6 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
|||||||
GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -185,7 +179,6 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
#ifndef GRID_NVCC
|
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -203,7 +196,6 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
|||||||
GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
|
GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
|
||||||
GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
|
GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Exterior kernels
|
// Exterior kernels
|
||||||
@ -213,7 +205,6 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
#ifndef GRID_NVCC
|
|
||||||
// SiteHalfSpinor tmp;
|
// SiteHalfSpinor tmp;
|
||||||
// SiteHalfSpinor chi;
|
// SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -234,7 +225,6 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
|||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
out[sF] = out[sF] + result;
|
out[sF] = out[sF] + result;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -242,7 +232,6 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
#ifndef GRID_NVCC
|
|
||||||
// SiteHalfSpinor tmp;
|
// SiteHalfSpinor tmp;
|
||||||
// SiteHalfSpinor chi;
|
// SiteHalfSpinor chi;
|
||||||
SiteHalfSpinor *chi_p;
|
SiteHalfSpinor *chi_p;
|
||||||
@ -263,14 +252,12 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
out[sF] = out[sF] + result;
|
out[sF] = out[sF] + result;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||||
{
|
{
|
||||||
#ifndef GRID_NVCC
|
|
||||||
SiteHalfSpinor tmp;
|
SiteHalfSpinor tmp;
|
||||||
SiteHalfSpinor chi;
|
SiteHalfSpinor chi;
|
||||||
SiteSpinor result;
|
SiteSpinor result;
|
||||||
@ -288,7 +275,6 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
|
|||||||
GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
|
GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
|
||||||
GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
|
GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
|
||||||
vstream(out[sF], result);
|
vstream(out[sF], result);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
|
@ -61,7 +61,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
{
|
{
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
|
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
|
||||||
uint4 * mem_pun = (uint4 *)mem;
|
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
|
||||||
uint4 * chip_pun = (uint4 *)&chip;
|
uint4 * chip_pun = (uint4 *)&chip;
|
||||||
* chip_pun = * mem_pun;
|
* chip_pun = * mem_pun;
|
||||||
#else
|
#else
|
||||||
@ -73,28 +73,24 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
#ifdef GPU_VEC
|
#ifdef GPU_VEC
|
||||||
#if 1
|
#if 1
|
||||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||||
synchronise(); \
|
|
||||||
if (SE._is_local) { \
|
if (SE._is_local) { \
|
||||||
int mask = Nsimd >> (ptype + 1); \
|
int mask = Nsimd >> (ptype + 1); \
|
||||||
int plane= SE._permute ? (lane ^ mask) : lane; \
|
int plane= SE._permute ? (lane ^ mask) : lane; \
|
||||||
auto in_l = extractLane(plane,in[SE._offset+s]); \
|
auto in_l = extractLane(plane,in[SE._offset+s]); \
|
||||||
spProj(chi,in_l); \
|
spProj(chi,in_l); \
|
||||||
} else { \
|
} else { \
|
||||||
chi = extractLane(lane,buf[SE._offset+s]); \
|
chi = extractLane(lane,buf[SE._offset+s]); \
|
||||||
} \
|
} \
|
||||||
synchronise();
|
synchronise();
|
||||||
#else
|
#else
|
||||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||||
{ int mask = Nsimd >> (ptype + 1); \
|
{ int mask = Nsimd >> (ptype + 1); \
|
||||||
int plane= SE._permute ? (lane ^ mask) : lane; \
|
int plane= SE._permute ? (lane ^ mask) : lane; \
|
||||||
synchronise(); \
|
|
||||||
auto in_l = extractLane(plane,in[SE._offset+s]); \
|
auto in_l = extractLane(plane,in[SE._offset+s]); \
|
||||||
synchronise(); \
|
|
||||||
spProj(chi,in_l); }
|
spProj(chi,in_l); }
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
#define GPU_COALESCED_STENCIL_LEG_PROJ(Dir,spProj) \
|
||||||
synchronise(); \
|
|
||||||
if (SE._is_local) { \
|
if (SE._is_local) { \
|
||||||
auto in_t = in[SE._offset+s]; \
|
auto in_t = in[SE._offset+s]; \
|
||||||
if (SE._permute) { \
|
if (SE._permute) { \
|
||||||
@ -111,8 +107,8 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int Ls, int s,
|
SiteHalfSpinor *buf, int Ls, int s,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
#ifdef GPU_VEC
|
#ifdef GPU_VEC
|
||||||
typename SiteHalfSpinor::scalar_object chi;
|
typename SiteHalfSpinor::scalar_object chi;
|
||||||
@ -182,7 +178,6 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSiteDag(StencilView &st, Dou
|
|||||||
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
|
Impl::multLinkGpu(lane,Uchi,U[sU],chi,Tm);
|
||||||
accumReconTm(result, Uchi);
|
accumReconTm(result, Uchi);
|
||||||
|
|
||||||
synchronise();
|
|
||||||
#ifdef GPU_VEC
|
#ifdef GPU_VEC
|
||||||
insertLane (lane,out[sF],result);
|
insertLane (lane,out[sF],result);
|
||||||
#else
|
#else
|
||||||
@ -268,7 +263,6 @@ accelerator_inline void WilsonKernels<Impl>::GpuDhopSite(StencilView &st, SiteDo
|
|||||||
Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
|
Impl::multLinkGpu(lane,Uchi,U,chi,Tm);
|
||||||
accumReconTp(result, Uchi);
|
accumReconTp(result, Uchi);
|
||||||
|
|
||||||
synchronise();
|
|
||||||
#ifdef GPU_VEC
|
#ifdef GPU_VEC
|
||||||
insertLane (lane,out[sF],result);
|
insertLane (lane,out[sF],result);
|
||||||
#else
|
#else
|
||||||
|
Loading…
x
Reference in New Issue
Block a user