1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-19 16:27:05 +01:00

Make view specify where and drive data motion - first cut.

This is a compile tiime option --enable-unified=yes/no
This commit is contained in:
Peter Boyle
2020-05-21 16:13:16 -04:00
parent ebb60330c9
commit 7860a50f70
48 changed files with 688 additions and 718 deletions

View File

@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/lattice/Lattice_view.h>
#include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h>

View File

@ -91,12 +91,16 @@ const lobj & eval(const uint64_t ss, const LatticeExprView<lobj> &arg)
{
return arg[ss];
}
// What needs this?
#if 1
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
auto view = arg.View();
return view[ss];
}
#endif
///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand
@ -206,7 +210,7 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
{
lat.AcceleratorViewOpen();
lat.ViewOpen(AcceleratorRead);
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &notlat) {}
@ -237,7 +241,7 @@ inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
{
lat.AcceleratorViewClose();
lat.ViewClose();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose(T1 &notlat) {}

View File

@ -36,9 +36,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(AcceleratorRead);
conformable(ret,rhs);
conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@ -55,9 +55,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -72,9 +72,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -88,9 +88,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -107,8 +107,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs);
@ -120,8 +120,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -134,8 +134,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -147,8 +147,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@ -164,8 +164,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto rhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@ -178,8 +178,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto rhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@ -192,8 +192,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto rhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@ -205,8 +205,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto rhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@ -220,9 +220,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
auto ret_v = ret.View();
auto x_v = x.View();
auto y_v = y.View();
auto ret_v = ret.View(AcceleratorWrite);
auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp);
@ -233,9 +233,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
auto ret_v = ret.View();
auto x_v = x.View();
auto y_v = y.View();
auto ret_v = ret.View(AcceleratorWrite);
auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp);

View File

@ -28,6 +28,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#define STREAMING_STORES
@ -36,181 +37,6 @@ NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeExprView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
// Non accelerator functions
LatticeExprView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
~LatticeExprView(){}
void AcceleratorViewOpen(void)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
void *cpu_ptr=this->_odata;
// std::cout << "AccViewOpen "<<std::hex<<this->_odata <<std::dec<<std::endl;
this->_odata=(vobj *)AllocationCache::AccViewOpen(this->_odata,this->_odata_size*sizeof(vobj),1,0);
}
void AcceleratorViewClose(void)
{ // Inform the manager
// std::cout << "View Close"<<std::hex<<this->_odata<<std::dec <<std::endl;
AllocationCache::AccViewClose((void *)this->_odata);
}
void CpuViewOpen(void)
{ // Translate the pointer
void *cpu_ptr=this->_odata;
// std::cout << "CpuViewOpen "<<std::hex<<this->_odata <<std::dec<<std::endl;
this->_odata=(vobj *)AllocationCache::CpuViewOpen(cpu_ptr,this->_odata_size*sizeof(vobj),1,0);
}
void CpuViewClose(void)
{ // Inform the manager
// std::cout << "CpuViewClose"<<std::hex<<this->_odata<<std::dec <<std::endl;
AllocationCache::CpuViewClose((void *)this->_odata);
}
};
// UserView constructor,destructor updates view manager
// Non-copyable object??? Second base with copy/= deleted?
template<class vobj>
class LatticeView : public LatticeExprView<vobj>
{
public:
// Rvalue
/*
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
*/
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeExprView<vobj> (refer_to_me)
{
this->AcceleratorViewOpen();
}
~LatticeView(){
this->AcceleratorViewClose();
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeExprView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
/////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code
@ -253,14 +79,20 @@ private:
}
}
public:
/////////////////////////////////////////////////////////////////////////////////
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
}
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const
LatticeView<vobj> View (ViewMode mode) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
return accessor;
}
@ -286,7 +118,7 @@ public:
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View();
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp);
@ -308,7 +140,7 @@ public:
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View();
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp);
@ -329,7 +161,7 @@ public:
this->checkerboard=cb;
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View();
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp);
@ -385,9 +217,9 @@ public:
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View();
thread_for(ss,me.size(),{
me[ss] = r;
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{
me[ss]= r;
});
return *this;
}
@ -398,11 +230,12 @@ public:
///////////////////////////////////////////
// user defined constructor
///////////////////////////////////////////
Lattice(GridBase *grid) {
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
SetViewMode(mode);
}
// virtual ~Lattice(void) = default;
@ -418,7 +251,6 @@ public:
// copy constructor
///////////////////////////////////////////
Lattice(const Lattice& r){
// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
this->_grid = r.Grid();
resize(this->_grid->oSites());
*this = r;
@ -441,8 +273,8 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View();
auto him= r.View();
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@ -455,8 +287,8 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View();
auto him= r.View();
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});

View File

@ -78,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
auto lhs_v = lhs.View(CpuRead);
auto rhs_v = rhs.View(CpuRead);
auto ret_v = ret.View(CpuWrite);
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
});
@ -93,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
auto lhs_v = lhs.View(CpuRead);
auto ret_v = ret.View(CpuWrite);
thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs);
});
@ -107,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View();
auto ret_v = ret.View();
auto rhs_v = rhs.View(CpuRead);
auto ret_v = ret.View(CpuWrite);
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]);
});

View File

@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid();
int Nsimd = grid->iSites();
auto l_v = l.View();
auto l_v = l.View(CpuWrite);
thread_for( o, grid->oSites(), {
vector_type vI;
Coordinate gcoor;
@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
});
};
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid);

View File

@ -43,8 +43,8 @@ template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto rhs_v = rhs.View();
auto ret_v = ret.View();
auto rhs_v = rhs.View(AcceleratorRead);
auto ret_v = ret.View(AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
});
@ -56,9 +56,9 @@ template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(AcceleratorRead);
auto ret_v = ret.View(AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
});
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(AcceleratorRead);
auto ret_v = ret.View(AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop

View File

@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto Y_v = Y.View();
auto R_v = R.View();
auto X_v = X.View(CpuRead);
auto Y_v = Y.View(CpuRead);
auto R_v = R.View(CpuWrite);
thread_region
{
std::vector<vobj> s_x(Nblock);
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto R_v = R.View();
auto X_v = X.View(CpuRead);
auto R_v = R.View(CpuWrite);
thread_region
{
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto lhs_v = lhs.View(CpuRead);
auto rhs_v = rhs.View(CpuRead);
thread_region {
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);

View File

@ -46,8 +46,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
{
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(CpuWrite);
auto lhs_v = lhs.View(CpuRead);
thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
});
@ -58,8 +58,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
{
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(CpuWrite);
auto lhs_v = lhs.View(CpuRead);
thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
});
@ -72,8 +72,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View(CpuRead);
auto lhs_v = lhs.View(CpuWrite);
thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
});
@ -81,8 +81,8 @@ void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj()
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View(CpuRead);
auto lhs_v = lhs.View(CpuWrite);
thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
});
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
auto l_v = l.View(CpuWrite);
if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf);
buf[idx] = s;
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
auto l_v = l.View(CpuWrite);
extract(l_v[odx],buf);
s = buf[idx];
@ -173,7 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site);
odx= grid->oIndex(site);
auto l_v = l.View();
auto l_v = l.View(CpuRead);
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s;
@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site);
odx= grid->oIndex(site);
auto l_v = l.View();
auto l_v = l.View(CpuWrite);
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){

View File

@ -40,8 +40,8 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
auto lhs_v = lhs.View(AcceleratorRead);
auto ret_v = ret.View(AcceleratorWrite);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
});
@ -50,8 +50,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
auto lhs_v = lhs.View(AcceleratorRead);
auto ret_v = ret.View(AcceleratorWrite);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
});

View File

@ -76,7 +76,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto arg_v = arg.View();
auto arg_v = arg.View(AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites);
arg.Grid()->GlobalSum(ssum);
@ -102,8 +102,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
GridBase *grid = left.Grid();
// Might make all code paths go this way.
auto left_v = left.View();
auto right_v=right.View();
auto left_v = left.View(AcceleratorRead);
auto right_v=right.View(AcceleratorRead);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
@ -167,9 +167,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid();
auto x_v=x.View();
auto y_v=y.View();
auto z_v=z.View();
auto x_v=x.View(AcceleratorRead);
auto y_v=y.View(AcceleratorRead);
auto z_v=z.View(AcceleratorWrite);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
@ -271,7 +271,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
auto Data_v=Data.View();
auto Data_v=Data.View(CpuRead);
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
@ -349,8 +349,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View();
auto rhv=rhs.View();
auto lhv=lhs.View(CpuRead);
auto rhv=rhs.View(CpuRead);
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@ -457,14 +457,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av;
auto Rv=R.View();
auto Xv=X.View();
auto Yv=Y.View();
thread_for_collapse(2, n, e1, {
for(int b=0;b<e2;b++){
auto Rv=R.View(CpuWrite);
auto Xv=X.View(CpuRead);
auto Yv=Y.View(CpuRead);
thread_for2d( n, e1, b,e2, {
int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss];
}
});
}
};
@ -517,9 +515,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v=X.View();
auto Y_v=Y.View();
auto R_v=R.View();
auto X_v=X.View(CpuRead);
auto Y_v=Y.View(CpuRead);
auto R_v=R.View(CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
@ -564,13 +562,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View();
auto X_v = X.View();
auto R_v = R.View(CpuWrite);
auto X_v = X.View(CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
@ -628,8 +627,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View();
auto rhs_v=rhs.View();
auto lhs_v=lhs.View(CpuRead);
auto rhs_v=rhs.View(CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);

View File

@ -375,7 +375,7 @@ public:
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View();
auto l_v = l.View(CpuWrite);
thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times

View File

@ -41,8 +41,8 @@ template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
});
@ -56,8 +56,8 @@ template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
});

View File

@ -49,8 +49,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.Checkerboard() = cb;
auto half_v = half.View();
auto full_v = full.View();
auto half_v = half.View(CpuWrite);
auto full_v = full.View(CpuRead);
thread_for(ss, full.Grid()->oSites(),{
int cbos;
Coordinate coor;
@ -65,8 +65,8 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
}
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.Checkerboard();
auto half_v = half.View();
auto full_v = full.View();
auto half_v = half.View(CpuRead);
auto full_v = full.View(CpuWrite);
thread_for(ss,full.Grid()->oSites(),{
Coordinate coor;
@ -92,9 +92,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<CComplex> ip(coarse);
// auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
auto ip_ = ip.View();
auto coarseData_ = coarseData.View(AcceleratorWrite);
auto ip_ = ip.View(AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
blockInnerProduct(ip,Basis[v],fineData);
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
@ -102,7 +101,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
});
}
}
#if 0
template<class vobj,class CComplex,int nbasis>
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
@ -132,8 +131,8 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
auto fineData_ = fineData.View(AcceleratorRead);
auto coarseData_ = coarseData.View(AcceleratorWrite);
////////////////////////////////////////////////////////////////////////////////////////////////////////
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
// Otherwise do fine inner product per site, and make the update atomic
@ -142,7 +141,7 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
auto sc=sci/nbasis;
auto i=sci%nbasis;
auto Basis_ = Basis[i].View();
auto Basis_ = Basis[i].View(AcceleratorRead);
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
@ -165,6 +164,7 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
});
return;
}
#endif
template<class vobj,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -191,10 +191,10 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
}
auto fineZ_ = fineZ.View();
auto fineX_ = fineX.View();
auto fineY_ = fineY.View();
auto coarseA_= coarseA.View();
auto fineZ_ = fineZ.View(AcceleratorWrite);
auto fineX_ = fineX.View(AcceleratorRead);
auto fineY_ = fineY.View(AcceleratorRead);
auto coarseA_= coarseA.View(AcceleratorRead);
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@ -227,11 +227,10 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse);
// Precision promotion?
auto CoarseInner_ = CoarseInner.View();
auto coarse_inner_ = coarse_inner.View();
fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner);
auto CoarseInner_ = CoarseInner.View(AcceleratorWrite);
auto coarse_inner_ = coarse_inner.View(AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss];
});
@ -266,8 +265,8 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
// Turn this around to loop threaded over sc and interior loop
// over sf would thread better
auto coarseData_ = coarseData.View();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View(AcceleratorWrite);
auto fineData_ = fineData.View(AcceleratorRead);
accelerator_for(sc,coarse->oSites(),1,{
@ -360,8 +359,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
}
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
auto fineData_ = fineData.View(AcceleratorWrite);
auto coarseData_ = coarseData.View(AcceleratorRead);
// Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{
@ -374,7 +373,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View();
/* auto basis_ = Basis[i].View( );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
}
@ -395,8 +394,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
Lattice<CComplex> cip(coarse);
auto cip_ = cip.View();
auto ip_ = ip.View();
auto cip_ = cip.View(AcceleratorWrite);
auto ip_ = ip.View(AcceleratorRead);
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
coalescedWrite(cip_[sc], ip_(sc)());
});
@ -470,8 +469,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
auto t_v = To.View();
auto f_v = From.View();
auto t_v = To.View(AcceleratorWrite);
auto f_v = From.View(AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);
@ -718,7 +717,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
}
//loop over outer index
auto in_v = in.View();
auto in_v = in.View(CpuRead);
thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@ -811,7 +810,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane);
}
auto out_v = out.View();
auto out_v = out.View(CpuWrite);
thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements
ExtractPointerArray<sobj> ptrs(nsimd);
@ -914,7 +913,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
auto out_v = out.View();
auto out_v = out.View(CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx);

View File

@ -41,8 +41,8 @@ NAMESPACE_BEGIN(Grid);
template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
});
@ -56,8 +56,8 @@ template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
});

View File

@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
auto rhs = rhs_i.View(AcceleratorRead);
auto ret = ret_i.View(AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y);
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
}
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
auto rhs = rhs_i.View(AcceleratorRead);
auto ret = ret_i.View(AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y));
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View();
auto rhs = rhs_i.View();
auto ret = ret_i.View(AcceleratorWrite);
auto rhs = rhs_i.View(AcceleratorRead);
ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y));
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
auto rhs = rhs_i.View(AcceleratorRead);
auto ret = ret_i.View(AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));