/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/tensors/Tensor_SIMT.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#pragma once

#include <string.h>

NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////
// Inside a GPU thread
////////////////////////////////////////////////
// Per-lane form of the SIMD exchange: each SIMT lane reads the lane pair (j0,j1)
// from either vp0 or vp1 (selected by the 'mask' bit of its own lane index) and
// deposits that pair into its own lane of mp0 and mp1.
template<class vobj>
accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type)
{
  typedef decltype(coalescedRead(mp0)) sobj;
  unsigned int Nsimd = vobj::Nsimd();
  unsigned int mask = Nsimd >> (type + 1);
  int lane = acceleratorSIMTlane(Nsimd);
  int j0 = lane &(~mask); // inner coor zero
  int j1 = lane |(mask) ; // inner coor one
  const vobj *vpa = &vp0;
  const vobj *vpb = &vp1;
  const vobj *vp = (lane&mask) ? (vpb) : (vpa);
  auto sa = coalescedRead(vp[0],j0);
  auto sb = coalescedRead(vp[0],j1);
  coalescedWrite(mp0,sa);
  coalescedWrite(mp1,sb);
}
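/*
 * Worked example (illustrative only, not part of the API): with Nsimd()==8 and
 * type==1, mask = 8 >> 2 = 2, so lanes pair across bit 1 of the lane index:
 *   lane 5 (0b101): j0=5, j1=7, mask bit clear -> reads vp0; mp0[5]=vp0[5], mp1[5]=vp0[7]
 *   lane 7 (0b111): j0=5, j1=7, mask bit set   -> reads vp1; mp0[7]=vp1[5], mp1[7]=vp1[7]
 * i.e. mp0 gathers the mask-bit-zero elements of both inputs and mp1 the
 * mask-bit-one elements, one SIMD lane per GPU thread.
 */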

#ifndef GRID_SIMT
//////////////////////////////////////////
// Trivial mapping of vectors on host
//////////////////////////////////////////
template<class vobj> accelerator_inline
vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
{
  return vec;
}
template<class vobj> accelerator_inline
vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=0)
{
  if ( doperm ) {
    vobj ret;
    permute(ret,vec, ptype);
    return ret;
  } else {
    return vec;
  }
}
// 'perm_mask' acts as a bitmask over the nd directions: bit d set means permute in direction d
template<class vobj> accelerator_inline
vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
{
  auto obj = vec, tmp = vec;
  for (int d=0;d<nd;d++)
    if (perm_mask & (0x1 << d)) { permute(obj,tmp,d); tmp=obj;}
  return obj;
}
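// Illustrative example (not part of the API): perm_mask = 0b101 with nd = 3
// applies permute() in directions 0 and 2 and leaves direction 1 untouched,
// so a single integer encodes an arbitrary subset of directional permutes.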

template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
{
  vec = extracted;
}
template<class vobj> accelerator_inline
void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
{
  vstream(vec, extracted);
}
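// Note: on this host path the 'lane' argument is ignored and whole vector
// objects are copied, so a generic kernel body written in terms of
// coalescedRead / coalescedWrite can compile unchanged for both the CPU (SIMD)
// and GPU (SIMT) targets.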

#else //==GRID_SIMT

//#ifndef GRID_SYCL
#if 1
// Use the scalar as our own complex on GPU ... thrust::complex or std::complex
template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
typename vsimd::scalar_type
coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
{
  typedef typename vsimd::scalar_type S;
  S * __restrict__ p=(S *)&vec;
  return p[lane];
}
template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
typename vsimd::scalar_type
coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
{
  typedef typename vsimd::scalar_type S;

  S * __restrict__ p=(S *)&vec;
  int mask = vsimd::Nsimd() >> (ptype + 1);
  int plane= doperm ? lane ^ mask : lane;
  return p[plane];
}
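// Illustrative example (not part of the API): with Nsimd()==16 and ptype==0,
// mask = 16 >> 1 = 8, so when doperm is set lane 3 reads element 3^8 = 11;
// the SIMD permute becomes a simple lane re-index per GPU thread.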
template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
void coalescedWrite(vsimd & __restrict__ vec,
                    const typename vsimd::scalar_type & __restrict__ extracted,
                    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
{
  typedef typename vsimd::scalar_type S;
  S * __restrict__ p=(S *)&vec;
  p[lane]=extracted;
}
#else
// For SYCL there is the option to use GpuComplex from inside the vector type in SIMT loops
// Faster for some reason
template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
typename vsimd::vector_type::datum
coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
{
  typedef typename vsimd::vector_type::datum S;
  S * __restrict__ p=(S *)&vec;
  return p[lane];
}
template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
typename vsimd::vector_type::datum
coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
{
  typedef typename vsimd::vector_type::datum S;

  S * __restrict__ p=(S *)&vec;
  int mask = vsimd::Nsimd() >> (ptype + 1);
  int plane= doperm ? lane ^ mask : lane;
  return p[plane];
}
template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
void coalescedWrite(vsimd & __restrict__ vec,
                    const typename vsimd::vector_type::datum & __restrict__ extracted,
                    int lane=acceleratorSIMTlane(vsimd::Nsimd()))
{
  typedef typename vsimd::vector_type::datum S;
  S * __restrict__ p=(S *)&vec;
  p[lane]=extracted;
}
#endif

//////////////////////////////////////////
// Extract and insert slices on the GPU
//////////////////////////////////////////
template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{
  return extractLane(lane,vec);
}
template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{
  int mask = vobj::Nsimd() >> (ptype + 1);
  int plane= doperm ? lane ^ mask : lane;
  return extractLane(plane,vec);
}
template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{
  int plane = lane;
  for (int d=0;d<nd;d++)
    plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane;
  return extractLane(plane,vec);
}
template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{
  insertLane(lane,vec,extracted);
}
template<class vobj> accelerator_inline
void coalescedWriteNonTemporal(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{
  insertLane(lane,vec,extracted);
}
#endif
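/*
 * Usage sketch (illustrative only; 'in_v', 'out_v' and 'osites' are
 * hypothetical lattice view accessors and site count, not defined in this
 * header):
 *
 *   accelerator_for(ss, osites, vobj::Nsimd(), {
 *     auto s = coalescedRead(in_v[ss]);  // host build: whole vobj; GRID_SIMT: this lane's scalar_object
 *     coalescedWrite(out_v[ss], s);      // host build: full-vector assign; GRID_SIMT: one-lane insert
 *   });
 *
 * The same generic body therefore serves both the host and device paths
 * defined above.
 */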

NAMESPACE_END(Grid);