1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-09 21:50:45 +01:00

Make view specify where and drive data motion - first cut.

This is a compile time option --enable-unified=yes/no
This commit is contained in:
Peter Boyle 2020-05-21 16:13:16 -04:00
parent ebb60330c9
commit 7860a50f70
48 changed files with 688 additions and 718 deletions

View File

@ -6,6 +6,7 @@
/////////////////// ///////////////////
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <memory>
#include <vector> #include <vector>
#include <array> #include <array>
#include <string> #include <string>

View File

@ -186,10 +186,10 @@ public:
hermop.HermOp(*Tn,y); hermop.HermOp(*Tn,y);
auto y_v = y.View(); auto y_v = y.View(AcceleratorWrite);
auto Tn_v = Tn->View(); auto Tn_v = Tn->View(AcceleratorWrite);
auto Tnp_v = Tnp->View(); auto Tnp_v = Tnp->View(AcceleratorWrite);
auto Tnm_v = Tnm->View(); auto Tnm_v = Tnm->View(AcceleratorWrite);
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
@ -264,12 +264,12 @@ public:
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
comms_usec += usecond(); comms_usec += usecond();
auto in_v = in.View(); auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(); auto out_v = out.View(AcceleratorWrite);
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View()); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@ -343,11 +343,11 @@ public:
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View()); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
auto out_v = out.View(); auto out_v = out.View(AcceleratorWrite);
auto in_v = in.View(); auto in_v = in.View(AcceleratorRead);
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
@ -542,10 +542,10 @@ public:
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi); blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
auto iZProj_v = iZProj.View() ; auto iZProj_v = iZProj.View(AcceleratorRead) ;
auto oZProj_v = oZProj.View() ; auto oZProj_v = oZProj.View(AcceleratorRead) ;
auto A_p = A[p].View(); auto A_p = A[p].View(AcceleratorWrite);
auto A_self = A[self_stencil].View(); auto A_self = A[self_stencil].View(AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
// if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });} // if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
@ -563,11 +563,11 @@ public:
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
{ {
auto tmp_ = tmp.View(); auto tmp_ = tmp.View(AcceleratorWrite);
auto evenmask_ = evenmask.View(); auto evenmask_ = evenmask.View(AcceleratorRead);
auto oddmask_ = oddmask.View(); auto oddmask_ = oddmask.View(AcceleratorRead);
auto Mphie_ = Mphie.View(); auto Mphie_ = Mphie.View(AcceleratorRead);
auto Mphio_ = Mphio.View(); auto Mphio_ = Mphio.View(AcceleratorRead);
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
}); });
@ -575,8 +575,8 @@ public:
blockProject(SelfProj,tmp,Subspace.subspace); blockProject(SelfProj,tmp,Subspace.subspace);
auto SelfProj_ = SelfProj.View(); auto SelfProj_ = SelfProj.View(AcceleratorRead);
auto A_self = A[self_stencil].View(); auto A_self = A[self_stencil].View(AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
for(int j=0;j<nbasis;j++){ for(int j=0;j<nbasis;j++){

View File

@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -191,7 +190,7 @@ public:
typedef typename sobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View(); auto pgbuf_v = pgbuf.View(CpuWrite);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;

View File

@ -122,9 +122,9 @@ class BiCGSTAB : public OperatorFunction<Field>
LinearCombTimer.Start(); LinearCombTimer.Start();
bo = beta * omega; bo = beta * omega;
auto p_v = p.View(); auto p_v = p.View(AcceleratorWrite);
auto r_v = r.View(); auto r_v = r.View(AcceleratorWrite);
auto v_v = v.View(); auto v_v = v.View(AcceleratorWrite);
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
}); });
@ -142,13 +142,13 @@ class BiCGSTAB : public OperatorFunction<Field>
alpha = rho / Calpha.real(); alpha = rho / Calpha.real();
LinearCombTimer.Start(); LinearCombTimer.Start();
auto h_v = h.View(); auto h_v = h.View(AcceleratorWrite);
auto psi_v = psi.View(); auto psi_v = psi.View(AcceleratorWrite);
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
}); });
auto s_v = s.View(); auto s_v = s.View(AcceleratorWrite);
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
}); });
@ -166,7 +166,7 @@ class BiCGSTAB : public OperatorFunction<Field>
omega = Comega.real() / norm2(t); omega = Comega.real() / norm2(t);
LinearCombTimer.Start(); LinearCombTimer.Start();
auto t_v = t.View(); auto t_v = t.View(AcceleratorWrite);
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));

View File

@ -140,9 +140,9 @@ public:
b = cp / c; b = cp / c;
LinearCombTimer.Start(); LinearCombTimer.Start();
auto psi_v = psi.View(); auto psi_v = psi.View(AcceleratorWrite);
auto p_v = p.View(); auto p_v = p.View(AcceleratorWrite);
auto r_v = r.View(); auto r_v = r.View(AcceleratorWrite);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));

View File

@ -57,17 +57,17 @@ void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
template<class Field> template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0].View()) View; typedef decltype(basis[0].View(CpuWrite)) View;
auto tmp_v = basis[0].View(); auto tmp_v = basis[0].View(CpuWrite);
Vector<View> basis_v(basis.size(),tmp_v); Vector<View> basis_v(basis.size(),tmp_v);
View *basis_vp = &basis_v[0]; View *basis_vp = &basis_v[0];
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View(); basis_v[k] = basis[k].View(CpuWrite);
} }
#if 0 #if 1
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
thread_region thread_region
{ {
@ -149,16 +149,16 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
template<class Field> template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0].View()) View; typedef decltype(basis[0].View(AcceleratorWrite)) View;
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard(); result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.View(); auto result_v=result.View(AcceleratorWrite);
Vector<View> basis_v(basis.size(),result_v); Vector<View> basis_v(basis.size(),result_v);
View * basis_vp = &basis_v[0]; View * basis_vp = &basis_v[0];
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View(); basis_v[k] = basis[k].View(AcceleratorRead);
} }
Vector<double> Qt_jv(Nm); Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0]; double * Qt_j = & Qt_jv[0];

View File

@ -12,7 +12,7 @@ bool MemoryProfiler::debug = false;
#define SMALL_LIMIT (4096) #define SMALL_LIMIT (4096)
#endif #endif
#ifdef POINTER_CACHE #ifdef ALLOCATION_CACHE
int PointerCache::victim; int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];

View File

@ -22,8 +22,10 @@ void *AllocationCache::AcceleratorAllocate(size_t bytes)
{ {
void *ptr = (void *) Lookup(bytes,Acc); void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes); ptr = (void *) acceleratorAllocDevice(bytes);
// std::cout <<"AcceleratorAllocate: allocated Accelerator pointer "<<std::hex<<ptr<<std::endl;
}
return ptr; return ptr;
} }
@ -31,7 +33,7 @@ void AllocationCache::AcceleratorFree (void *ptr,size_t bytes)
{ {
void *__freeme = Insert(ptr,bytes,Acc); void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) acceleratorFreeShared(__freeme); if ( __freeme ) acceleratorFreeDevice(__freeme);
} }
void *AllocationCache::CpuAllocate(size_t bytes) void *AllocationCache::CpuAllocate(size_t bytes)
{ {
@ -39,9 +41,7 @@ void *AllocationCache::CpuAllocate(size_t bytes)
if ( ptr == (void *) NULL ) { if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes); ptr = (void *) acceleratorAllocShared(bytes);
// std::cout <<"CpuAllocate: allocated pointer "<<std::hex<<ptr<<std::endl; // std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::endl;
} else {
// std::cout <<"CpuAllocate: cached pointer "<<std::hex<<ptr<<std::endl;
} }
return ptr; return ptr;
@ -50,7 +50,7 @@ void AllocationCache::CpuFree (void *ptr,size_t bytes)
{ {
// Look up in ViewCache // Look up in ViewCache
int e=CpuViewLookup(ptr); int e=CpuViewLookup(ptr);
if(e>=0){ Evict(e); } if(e>=0){ Discard(e); }
// If present remove entry and free accelerator too. // If present remove entry and free accelerator too.
// Can we ever hit a free event with a view still in scope? // Can we ever hit a free event with a view still in scope?
@ -90,13 +90,18 @@ void AllocationCache::Init(void)
Ncache[AccSmall]=Nc; Ncache[AccSmall]=Nc;
} }
} }
std::cout << "MemoryManager::Init() SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
} }
void *AllocationCache::Insert(void *ptr,size_t bytes,int type) void *AllocationCache::Insert(void *ptr,size_t bytes,int type)
{ {
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small; int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]); return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
#else
return ptr;
#endif
} }
void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{ {
@ -136,9 +141,13 @@ void *AllocationCache::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entri
void *AllocationCache::Lookup(size_t bytes,int type) void *AllocationCache::Lookup(size_t bytes,int type)
{ {
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small; int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]); return Lookup(bytes,Entries[cache],Ncache[cache]);
#else
return NULL;
#endif
} }
void *AllocationCache::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) void *AllocationCache::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{ {

View File

@ -32,11 +32,38 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h? // Move control to configure.ac and Config.h?
#define ALLOCATION_CACHE #undef ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024) #define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096) #define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/ /*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum ViewAdvise {
  AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
class AllocationCache { class AllocationCache {
private: private:
@ -70,19 +97,23 @@ private:
static void *AcceleratorAllocate(size_t bytes); static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes); static void AcceleratorFree (void *ptr,size_t bytes);
static int ViewVictim(void); static int ViewVictim(void);
static void CpuDiscard(int e);
static void Discard(int e);
static void Evict(int e); static void Evict(int e);
static void Flush(int e); static void Flush(int e);
static void Clone(int e); static void Clone(int e);
static int CpuViewLookup(void *CpuPtr); static int CpuViewLookup(void *CpuPtr);
static int AccViewLookup(void *AccPtr); // static int AccViewLookup(void *AccPtr);
static void AcceleratorViewClose(void* AccPtr);
static void *AcceleratorViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(void* Ptr);
static void *CpuViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
public: public:
static void Init(void); static void Init(void);
static void AccViewClose(void* AccPtr); static void ViewClose(void* AccPtr,ViewMode mode);
static void CpuViewClose(void* CpuPtr); static void *ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void *AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
static void *CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient);
static void *CpuAllocate(size_t bytes); static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes); static void CpuFree (void *ptr,size_t bytes);

View File

@ -1,9 +1,9 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#ifndef GRID_UNIFIED #ifndef GRID_UVM
#warning "Using explicit device memory copies" #warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define dprintf(...) #define dprintf
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// For caching copies of data on device // For caching copies of data on device
@ -20,15 +20,12 @@ typedef struct {
uint32_t cpuLock; uint32_t cpuLock;
} AcceleratorViewEntry; } AcceleratorViewEntry;
#define Write (1)
#define Read (2)
#define WriteDiscard (3)
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Data tables for ViewCache // Data tables for ViewCache
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
static AcceleratorViewEntry AccCache[NaccCacheMax]; static AcceleratorViewEntry AccCache[NaccCacheMax];
static int AccCacheVictim; // Base for round robin search static int AccCacheVictim; // Base for round robin search
static int NaccCache = 8; static int NaccCache = 32;
//////////////////////////////////// ////////////////////////////////////
// Priority ordering for unlocked entries // Priority ordering for unlocked entries
@ -68,7 +65,7 @@ int AllocationCache::ViewVictim(void)
if ( locks==0 ) { if ( locks==0 ) {
if( s==Empty ) { prioEmpty = e; dprintf("Empty");} if( s==Empty ) { prioEmpty = e; dprintf("Empty"); }
if( t == EvictNext ) { if( t == EvictNext ) {
if( s==CpuDirty ) { prioCpuDirtyEN = e; dprintf("CpuDirty Transient");} if( s==CpuDirty ) { prioCpuDirtyEN = e; dprintf("CpuDirty Transient");}
@ -97,21 +94,42 @@ int AllocationCache::ViewVictim(void)
if ( prioEmpty >= 0 ) victim = prioEmpty; /*Highest prio is winner*/ if ( prioEmpty >= 0 ) victim = prioEmpty; /*Highest prio is winner*/
assert(victim >= 0); // Must succeed/ assert(victim >= 0); // Must succeed/
dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim); dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim);
// advance victim pointer // advance victim pointer
AccCacheVictim=(AccCacheVictim+1)%NaccCache; AccCacheVictim=(AccCacheVictim+1)%NaccCache;
dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache); dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache);
return victim; return victim;
} }
///////////////////////////////////////////////// /////////////////////////////////////////////////
// Accelerator cache motion // Accelerator cache motion
///////////////////////////////////////////////// /////////////////////////////////////////////////
void AllocationCache::Discard(int e) // remove from Accelerator, remove entry, without flush
{
if(AccCache[e].state!=Empty){
dprintf("AllocationCache: Discard(%d) %llx,%llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
assert(AccCache[e].accLock==0);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr) {
dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes);
}
}
AccCache[e].AccPtr=NULL;
AccCache[e].CpuPtr=NULL;
AccCache[e].bytes=0;
AccCache[e].state=Empty;
AccCache[e].accLock=0;
AccCache[e].cpuLock=0;
}
void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry
{ {
if(AccCache[e].state!=Empty){ if(AccCache[e].state!=Empty){
dprintf("AllocationCache: Evict(%d) %llx,%llxn",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); dprintf("AllocationCache: Evict(%d) %llx,%llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr);
assert(AccCache[e].accLock==0); assert(AccCache[e].accLock==0);
assert(AccCache[e].cpuLock==0); assert(AccCache[e].cpuLock==0);
if(AccCache[e].state==AccDirty) { if(AccCache[e].state==AccDirty) {
@ -119,7 +137,7 @@ void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerat
} }
assert(AccCache[e].CpuPtr!=NULL); assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr) { if(AccCache[e].AccPtr) {
dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr); dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr);
AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes); AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes);
} }
} }
@ -132,7 +150,7 @@ void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerat
} }
void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. Do not remove void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. Do not remove
{ {
dprintf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); // printf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout);
assert(AccCache[e].state==AccDirty); assert(AccCache[e].state==AccDirty);
assert(AccCache[e].cpuLock==0); assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock==0); assert(AccCache[e].accLock==0);
@ -150,14 +168,50 @@ void AllocationCache::Clone(int e)// Copy from CPU, mark consistent. Allocate if
if(AccCache[e].AccPtr==NULL){ if(AccCache[e].AccPtr==NULL){
AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes); AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes);
} }
dprintf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); // printf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout);
acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes); acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes);
AccCache[e].state=Consistent; AccCache[e].state=Consistent;
} }
void AllocationCache::CpuDiscard(int e)// Mark accelerator dirty without copy. Allocate if necessary
{
assert(AccCache[e].state!=Empty);
assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock==0);
assert(AccCache[e].CpuPtr!=NULL);
if(AccCache[e].AccPtr==NULL){
AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes);
}
// printf("AllocationCache: CpuDiscard(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout);
// acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes);
AccCache[e].state=AccDirty;
}
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// View management // View management
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) void AllocationCache::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
AcceleratorViewClose(Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose(Ptr);
} else {
assert(0);
}
}
void *AllocationCache::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
return AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
return nullptr;
}
}
void *AllocationCache::AcceleratorViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{ {
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty // Find if present, otherwise get or force an empty
@ -165,9 +219,11 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
int e=CpuViewLookup(CpuPtr); int e=CpuViewLookup(CpuPtr);
if(e==-1) { if(e==-1) {
e = ViewVictim(); e = ViewVictim();
dprintf("AcceleratorViewOpen Victim is %d\n",e);
Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
} }
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache[e].cpuLock==0); // Programming error assert(AccCache[e].cpuLock==0); // Programming error
if(AccCache[e].state!=Empty) { if(AccCache[e].state!=Empty) {
@ -193,35 +249,50 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
AccCache[e].AccPtr = NULL; AccCache[e].AccPtr = NULL;
AccCache[e].bytes = bytes; AccCache[e].bytes = bytes;
AccCache[e].state = CpuDirty; // Cpu starts primary AccCache[e].state = CpuDirty; // Cpu starts primary
Clone(e); if(mode==AcceleratorWriteDiscard){
if(mode==Write) CpuDiscard(e);
AccCache[e].state = AccDirty; // Empty + AccWrite=> AccDirty AccCache[e].state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
else } else if(mode==AcceleratorWrite){
Clone(e);
AccCache[e].state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(e);
AccCache[e].state = Consistent; // Empty + AccRead => Consistent AccCache[e].state = Consistent; // Empty + AccRead => Consistent
}
AccCache[e].accLock= 1; AccCache[e].accLock= 1;
} else if(AccCache[e].state&CpuDirty ){ // printf("Copied Empy entry %d into device accLock %d\n",e,AccCache[e].accLock);
Clone(e); } else if(AccCache[e].state==CpuDirty ){
if(mode==Write) if(mode==AcceleratorWriteDiscard) {
AccCache[e].state = AccDirty; // CpuDirty + AccWrite=> AccDirty CpuDiscard(e);
else AccCache[e].state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(e);
AccCache[e].state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(e);
AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache[e].accLock++; AccCache[e].accLock++;
} else if(AccCache[e].state&Consistent) { // printf("Copied CpuDirty entry %d into device accLock %d\n",e,AccCache[e].accLock);
if(mode==Write) } else if(AccCache[e].state==Consistent) {
AccCache[e].state = AccDirty; // Consistent + AccWrite=> AccDirty if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache[e].state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else else
AccCache[e].state = Consistent; // Consistent + AccRead => Consistent AccCache[e].state = Consistent; // Consistent + AccRead => Consistent
AccCache[e].accLock++; AccCache[e].accLock++;
} else if(AccCache[e].state&AccDirty) { // printf("Consistent entry %d into device accLock %d\n",e,AccCache[e].accLock);
if(mode==Write) } else if(AccCache[e].state==AccDirty) {
AccCache[e].state = AccDirty; // AccDirty + AccWrite=> AccDirty if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache[e].state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else else
AccCache[e].state = AccDirty; // AccDirty + AccRead => AccDirty AccCache[e].state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache[e].accLock++; AccCache[e].accLock++;
// printf("AccDirty entry %d into device accLock %d\n",e,AccCache[e].accLock);
} else { } else {
assert(0); assert(0);
} }
int transient =hint;
AccCache[e].transient= transient? EvictNext : 0; AccCache[e].transient= transient? EvictNext : 0;
return AccCache[e].AccPtr; return AccCache[e].AccPtr;
@ -241,12 +312,18 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
//////////////////////////////////// ////////////////////////////////////
// look up & decrement lock count // look up & decrement lock count
//////////////////////////////////// ////////////////////////////////////
void AllocationCache::AccViewClose(void* AccPtr) void AllocationCache::AcceleratorViewClose(void* AccPtr)
{ {
int e=AccViewLookup(AccPtr); int e=CpuViewLookup(AccPtr);
// printf("AccView close %d lock %d \n",e,AccCache[e].accLock);
if(e==-1) exit(0);
if(AccCache[e].cpuLock!=0) exit(0);
if(AccCache[e].accLock==0) exit(0);
/*
assert(e!=-1); assert(e!=-1);
assert(AccCache[e].cpuLock==0); assert(AccCache[e].cpuLock==0);
assert(AccCache[e].accLock>0); assert(AccCache[e].accLock>0);
*/
AccCache[e].accLock--; AccCache[e].accLock--;
} }
void AllocationCache::CpuViewClose(void* CpuPtr) void AllocationCache::CpuViewClose(void* CpuPtr)
@ -257,7 +334,7 @@ void AllocationCache::CpuViewClose(void* CpuPtr)
assert(AccCache[e].accLock==0); assert(AccCache[e].accLock==0);
AccCache[e].cpuLock--; AccCache[e].cpuLock--;
} }
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{ {
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty // Find if present, otherwise get or force an empty
@ -265,9 +342,11 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
int e=CpuViewLookup(CpuPtr); int e=CpuViewLookup(CpuPtr);
if(e==-1) { if(e==-1) {
e = ViewVictim(); e = ViewVictim();
dprintf("CpuViewOpen Victim is %d\n",e);
Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty
} }
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache[e].accLock==0); // Programming error assert(AccCache[e].accLock==0); // Programming error
if(AccCache[e].state!=Empty) { if(AccCache[e].state!=Empty) {
@ -288,7 +367,7 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
AccCache[e].cpuLock++; AccCache[e].cpuLock++;
} else if(AccCache[e].state==Consistent) { } else if(AccCache[e].state==Consistent) {
assert(AccCache[e].AccPtr != NULL); assert(AccCache[e].AccPtr != NULL);
if(mode==Write) if(mode==CpuWrite)
AccCache[e].state = CpuDirty; // Consistent +CpuWrite => CpuDirty AccCache[e].state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else else
AccCache[e].state = Consistent; // Consistent +CpuRead => Consistent AccCache[e].state = Consistent; // Consistent +CpuRead => Consistent
@ -296,7 +375,7 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi
} else if(AccCache[e].state==AccDirty) { } else if(AccCache[e].state==AccDirty) {
assert(AccCache[e].AccPtr != NULL); assert(AccCache[e].AccPtr != NULL);
Flush(e); Flush(e);
if(mode==Write) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush if(mode==CpuWrite) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache[e].state = Consistent; // AccDirty +CpuRead => Consistent, Flush else AccCache[e].state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache[e].cpuLock++; AccCache[e].cpuLock++;
} else { } else {
@ -321,16 +400,6 @@ int AllocationCache::CpuViewLookup(void *CpuPtr)
} }
return -1; return -1;
} }
int AllocationCache::AccViewLookup(void *AccPtr)
{
assert(AccPtr!=NULL);
for(int e=0;e<NaccCache;e++){
if ( (AccCache[e].state!=Empty) && (AccCache[e].AccPtr==AccPtr) ) {
return e;
}
}
return -1;
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -1,5 +1,5 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#ifdef GRID_UNIFIED #ifdef GRID_UVM
#warning "Grid is assuming unified virtual memory address space" #warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@ -7,21 +7,22 @@ NAMESPACE_BEGIN(Grid);
// View management is 1:1 address space mapping // View management is 1:1 address space mapping
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; } void AllocationCache::AcceleratorViewClose(void* AccPtr){};
void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; } void *AllocationCache::AcceleratorViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }
void AllocationCache::AccViewClose(void* AccPtr){} void AllocationCache::CpuViewClose(void* Ptr){};
void AllocationCache::CpuViewClose(void* CpuPtr){} void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }
int AllocationCache::CpuViewLookup(void *CpuPtr){ return 0;}
///////////////////////////////////// /////////////////////////////////////
// Dummy stubs // Dummy stubs
///////////////////////////////////// /////////////////////////////////////
int AllocationCache::ViewVictim(void) { assert(0); return 0;} void AllocationCache::CpuDiscard(int e) { return;}
void AllocationCache::Evict(int e) { assert(0);} void AllocationCache::Discard(int e) { return;}
void AllocationCache::Flush(int e) { assert(0);} void AllocationCache::Evict(int e) { return; }
void AllocationCache::Clone(int e) { assert(0);} void AllocationCache::Flush(int e) { assert(0);}
void AllocationCache::Clone(int e) { assert(0);}
int AllocationCache::CpuViewLookup(void *CpuPtr){assert(0); return 0;} int AllocationCache::ViewVictim(void) { assert(0); return 0;}
int AllocationCache::AccViewLookup(void *AccPtr){assert(0); return 0;} void AllocationCache::ViewClose(void* AccPtr,ViewMode mode){};
void *AllocationCache::ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){return CpuPtr;};
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -52,7 +52,6 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@ -73,6 +72,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
} }
} }
} }
auto rhs_v = rhs.View(AcceleratorRead);
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = &Cshift_table[0]; auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
@ -100,7 +100,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
if ( cbmask ==0x3){ if ( cbmask ==0x3){
accelerator_for2d(n,e1,b,e2,1,{ accelerator_for2d(n,e1,b,e2,1,{
int o = n*n1; int o = n*n1;
@ -179,7 +179,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
} }
} }
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorWrite);
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = &Cshift_table[0]; auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
@ -204,7 +204,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorWrite);
accelerator_for2d(n,e1,b,e2,1,{ accelerator_for2d(n,e1,b,e2,1,{
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension];
@ -216,7 +216,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
// Test_cshift_red_black code. // Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(CpuWrite);
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
@ -272,8 +272,8 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
} }
} }
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorWrite);
auto table = &Cshift_table[0]; auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
lhs_v[table[i].first]=rhs_v[table[i].second]; lhs_v[table[i].first]=rhs_v[table[i].second];
@ -315,8 +315,8 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}} }}
} }
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorWrite);
auto table = &Cshift_table[0]; auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);

View File

@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/lattice/Lattice_view.h>
#include <Grid/lattice/Lattice_base.h> #include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h> #include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h> #include <Grid/lattice/Lattice_ET.h>

View File

@ -91,12 +91,16 @@ const lobj & eval(const uint64_t ss, const LatticeExprView<lobj> &arg)
{ {
return arg[ss]; return arg[ss];
} }
// What needs this?
#if 1
template <class lobj> accelerator_inline template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{ {
auto view = arg.View(); auto view = arg.View();
return view[ss]; return view[ss];
} }
#endif
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand // handle nodes in syntax tree- eval one operand
@ -206,7 +210,7 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
{ {
lat.AcceleratorViewOpen(); lat.ViewOpen(AcceleratorRead);
} }
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &notlat) {} inline void ExpressionViewOpen(T1 &notlat) {}
@ -237,7 +241,7 @@ inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose( T1 &lat) // Lattice leaf inline void ExpressionViewClose( T1 &lat) // Lattice leaf
{ {
lat.AcceleratorViewClose(); lat.ViewClose();
} }
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose(T1 &notlat) {} inline void ExpressionViewClose(T1 &notlat) {}

View File

@ -36,9 +36,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@ -55,9 +55,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -72,9 +72,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -88,9 +88,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -107,8 +107,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs); mult(&tmp,&lhs_v(ss),&rhs);
@ -120,8 +120,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -134,8 +134,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -147,8 +147,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -164,8 +164,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto rhs_v = lhs.View(); auto rhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -178,8 +178,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto rhs_v = lhs.View(); auto rhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -192,8 +192,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto rhs_v = lhs.View(); auto rhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -205,8 +205,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto rhs_v = lhs.View(); auto rhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -220,9 +220,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(); auto y_v = y.View(AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss); auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@ -233,9 +233,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(); auto y_v = y.View(AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);

View File

@ -28,6 +28,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#define STREAMING_STORES #define STREAMING_STORES
@ -36,181 +37,6 @@ NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16]; extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeExprView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
// Non accelerator functions
LatticeExprView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
~LatticeExprView(){}
void AcceleratorViewOpen(void)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
void *cpu_ptr=this->_odata;
// std::cout << "AccViewOpen "<<std::hex<<this->_odata <<std::dec<<std::endl;
this->_odata=(vobj *)AllocationCache::AccViewOpen(this->_odata,this->_odata_size*sizeof(vobj),1,0);
}
void AcceleratorViewClose(void)
{ // Inform the manager
// std::cout << "View Close"<<std::hex<<this->_odata<<std::dec <<std::endl;
AllocationCache::AccViewClose((void *)this->_odata);
}
void CpuViewOpen(void)
{ // Translate the pointer
void *cpu_ptr=this->_odata;
// std::cout << "CpuViewOpen "<<std::hex<<this->_odata <<std::dec<<std::endl;
this->_odata=(vobj *)AllocationCache::CpuViewOpen(cpu_ptr,this->_odata_size*sizeof(vobj),1,0);
}
void CpuViewClose(void)
{ // Inform the manager
// std::cout << "CpuViewClose"<<std::hex<<this->_odata<<std::dec <<std::endl;
AllocationCache::CpuViewClose((void *)this->_odata);
}
};
// UserView constructor,destructor updates view manager
// Non-copyable object??? Second base with copy/= deleted?
template<class vobj>
class LatticeView : public LatticeExprView<vobj>
{
public:
// Rvalue
/*
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
*/
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeExprView<vobj> (refer_to_me)
{
this->AcceleratorViewOpen();
}
~LatticeView(){
this->AcceleratorViewClose();
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeExprView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics. // The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code // This contains extra (host resident) grid pointer data that may be accessed by host code
@ -253,14 +79,20 @@ private:
} }
} }
public: public:
/////////////////////////////////////////////////////////////////////////////////
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
}
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops. // Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device // The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas // in device lambdas
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const LatticeView<vobj> View (ViewMode mode) const
{ {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this)); LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
return accessor; return accessor;
} }
@ -286,7 +118,7 @@ public:
auto exprCopy = expr; auto exprCopy = expr;
ExpressionViewOpen(exprCopy); ExpressionViewOpen(exprCopy);
auto me = View(); auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,exprCopy); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
@ -308,7 +140,7 @@ public:
auto exprCopy = expr; auto exprCopy = expr;
ExpressionViewOpen(exprCopy); ExpressionViewOpen(exprCopy);
auto me = View(); auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,exprCopy); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
@ -329,7 +161,7 @@ public:
this->checkerboard=cb; this->checkerboard=cb;
auto exprCopy = expr; auto exprCopy = expr;
ExpressionViewOpen(exprCopy); ExpressionViewOpen(exprCopy);
auto me = View(); auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,exprCopy); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
@ -385,9 +217,9 @@ public:
} }
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View(); auto me = View(AcceleratorWriteDiscard);
thread_for(ss,me.size(),{ accelerator_for(ss,me.size(),1,{
me[ss] = r; me[ss]= r;
}); });
return *this; return *this;
} }
@ -398,11 +230,12 @@ public:
/////////////////////////////////////////// ///////////////////////////////////////////
// user defined constructor // user defined constructor
/////////////////////////////////////////// ///////////////////////////////////////////
Lattice(GridBase *grid) { Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid; this->_grid = grid;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0); assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0; this->checkerboard=0;
SetViewMode(mode);
} }
// virtual ~Lattice(void) = default; // virtual ~Lattice(void) = default;
@ -418,7 +251,6 @@ public:
// copy constructor // copy constructor
/////////////////////////////////////////// ///////////////////////////////////////////
Lattice(const Lattice& r){ Lattice(const Lattice& r){
// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
this->_grid = r.Grid(); this->_grid = r.Grid();
resize(this->_grid->oSites()); resize(this->_grid->oSites());
*this = r; *this = r;
@ -441,8 +273,8 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0; typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r); conformable(*this,r);
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
auto me = View(); auto me = View(AcceleratorWriteDiscard);
auto him= r.View(); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
@ -455,8 +287,8 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){ inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
conformable(*this,r); conformable(*this,r);
auto me = View(); auto me = View(AcceleratorWriteDiscard);
auto him= r.View(); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });

View File

@ -78,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(CpuRead);
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(CpuRead);
auto ret_v = ret.View(); auto ret_v = ret.View(CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
}); });
@ -93,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{ {
Lattice<vPredicate> ret(lhs.Grid()); Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(CpuRead);
auto ret_v = ret.View(); auto ret_v = ret.View(CpuWrite);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs); ret_v[ss]=op(lhs_v[ss],rhs);
}); });
@ -107,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(CpuRead);
auto ret_v = ret.View(); auto ret_v = ret.View(CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]); ret_v[ss]=op(lhs,rhs_v[ss]);
}); });

View File

@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid(); GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
auto l_v = l.View(); auto l_v = l.View(CpuWrite);
thread_for( o, grid->oSites(), { thread_for( o, grid->oSites(), {
vector_type vI; vector_type vI;
Coordinate gcoor; Coordinate gcoor;
@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
}); });
}; };
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -43,8 +43,8 @@ template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
}); });
@ -56,9 +56,9 @@ template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
}); });
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr; typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid()); Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),1,{ accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer // FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop // Use vector [] operator and don't read coalesce this loop

View File

@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); auto X_v = X.View(CpuRead);
auto Y_v = Y.View(); auto Y_v = Y.View(CpuRead);
auto R_v = R.View(); auto R_v = R.View(CpuWrite);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); auto X_v = X.View(CpuRead);
auto R_v = R.View(); auto R_v = R.View(CpuWrite);
thread_region thread_region
{ {
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(CpuRead);
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(CpuRead);
thread_region { thread_region {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);

View File

@ -46,8 +46,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); auto ret_v = ret.View(CpuWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(CpuRead);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
}); });
@ -58,8 +58,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); auto ret_v = ret.View(CpuWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(CpuRead);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
}); });
@ -72,8 +72,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{ {
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(CpuRead);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(CpuWrite);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
}); });
@ -81,8 +81,8 @@ void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj()
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{ {
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(CpuRead);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(CpuWrite);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
}); });
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// extract-modify-merge cycle is easiest way and this is not perf critical // extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); auto l_v = l.View(CpuWrite);
if ( rank == grid->ThisRank() ) { if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf); extract(l_v[odx],buf);
buf[idx] = s; buf[idx] = s;
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); auto l_v = l.View(CpuWrite);
extract(l_v[odx],buf); extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@ -173,7 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); auto l_v = l.View(CpuRead);
scalar_type * vp = (scalar_type *)&l_v[odx]; scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); auto l_v = l.View(CpuWrite);
scalar_type * vp = (scalar_type *)&l_v[odx]; scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){

View File

@ -40,8 +40,8 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss))); coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
}); });
@ -50,8 +50,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
}); });

View File

@ -76,7 +76,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{ {
auto arg_v = arg.View(); auto arg_v = arg.View(AcceleratorRead);
Integer osites = arg.Grid()->oSites(); Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites); auto ssum= sum(&arg_v[0],osites);
arg.Grid()->GlobalSum(ssum); arg.Grid()->GlobalSum(ssum);
@ -102,8 +102,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
// Might make all code paths go this way. // Might make all code paths go this way.
auto left_v = left.View(); auto left_v = left.View(AcceleratorRead);
auto right_v=right.View(); auto right_v=right.View(AcceleratorRead);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
@ -167,9 +167,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid(); GridBase *grid = x.Grid();
auto x_v=x.View(); auto x_v=x.View(AcceleratorRead);
auto y_v=y.View(); auto y_v=y.View(AcceleratorRead);
auto z_v=z.View(); auto z_v=z.View(AcceleratorWrite);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
@ -271,7 +271,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
auto Data_v=Data.View(); auto Data_v=Data.View(CpuRead);
thread_for( r,rd, { thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
@ -349,8 +349,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View(); auto lhv=lhs.View(CpuRead);
auto rhv=rhs.View(); auto rhv=rhs.View(CpuRead);
thread_for( r,rd,{ thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@ -457,14 +457,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
auto Rv=R.View(); auto Rv=R.View(CpuWrite);
auto Xv=X.View(); auto Xv=X.View(CpuRead);
auto Yv=Y.View(); auto Yv=Y.View(CpuRead);
thread_for_collapse(2, n, e1, { thread_for2d( n, e1, b,e2, {
for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss]; Rv[ss] = at*Xv[ss]+Yv[ss];
}
}); });
} }
}; };
@ -517,9 +515,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v=X.View(); auto X_v=X.View(CpuRead);
auto Y_v=Y.View(); auto Y_v=Y.View(CpuRead);
auto R_v=R.View(); auto R_v=R.View(CpuWrite);
thread_region thread_region
{ {
Vector<vobj> s_x(Nblock); Vector<vobj> s_x(Nblock);
@ -564,13 +562,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// int nl=1; // int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog]; int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View(); auto R_v = R.View(CpuWrite);
auto X_v = X.View(); auto X_v = X.View(CpuRead);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@ -628,8 +627,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View(); auto lhs_v=lhs.View(CpuRead);
auto rhs_v=rhs.View(); auto rhs_v=rhs.View(CpuRead);
thread_region thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);

View File

@ -375,7 +375,7 @@ public:
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View(); auto l_v = l.View(CpuWrite);
thread_for( ss, osites, { thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd); ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times

View File

@ -41,8 +41,8 @@ template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))> inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{ {
Lattice<decltype(trace(vobj()))> ret(lhs.Grid()); Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss))); coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
}); });
@ -56,8 +56,8 @@ template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{ {
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
}); });

View File

@ -49,8 +49,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.Checkerboard() = cb; half.Checkerboard() = cb;
auto half_v = half.View(); auto half_v = half.View(CpuWrite);
auto full_v = full.View(); auto full_v = full.View(CpuRead);
thread_for(ss, full.Grid()->oSites(),{ thread_for(ss, full.Grid()->oSites(),{
int cbos; int cbos;
Coordinate coor; Coordinate coor;
@ -65,8 +65,8 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.Checkerboard(); int cb = half.Checkerboard();
auto half_v = half.View(); auto half_v = half.View(CpuRead);
auto full_v = full.View(); auto full_v = full.View(CpuWrite);
thread_for(ss,full.Grid()->oSites(),{ thread_for(ss,full.Grid()->oSites(),{
Coordinate coor; Coordinate coor;
@ -92,9 +92,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<CComplex> ip(coarse); Lattice<CComplex> ip(coarse);
// auto fineData_ = fineData.View(); auto coarseData_ = coarseData.View(AcceleratorWrite);
auto coarseData_ = coarseData.View(); auto ip_ = ip.View(AcceleratorWrite);
auto ip_ = ip.View();
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
blockInnerProduct(ip,Basis[v],fineData); blockInnerProduct(ip,Basis[v],fineData);
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
@ -102,7 +101,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
}); });
} }
} }
#if 0
template<class vobj,class CComplex,int nbasis> template<class vobj,class CComplex,int nbasis>
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData, const Lattice<vobj> &fineData,
@ -132,8 +131,8 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
coarseData=Zero(); coarseData=Zero();
auto fineData_ = fineData.View(); auto fineData_ = fineData.View(AcceleratorRead);
auto coarseData_ = coarseData.View(); auto coarseData_ = coarseData.View(AcceleratorWrite);
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
// Otherwise do fine inner product per site, and make the update atomic // Otherwise do fine inner product per site, and make the update atomic
@ -142,7 +141,7 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
auto sc=sci/nbasis; auto sc=sci/nbasis;
auto i=sci%nbasis; auto i=sci%nbasis;
auto Basis_ = Basis[i].View(); auto Basis_ = Basis[i].View(AcceleratorRead);
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
@ -165,6 +164,7 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
}); });
return; return;
} }
#endif
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -191,10 +191,10 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
auto fineZ_ = fineZ.View(); auto fineZ_ = fineZ.View(AcceleratorWrite);
auto fineX_ = fineX.View(); auto fineX_ = fineX.View(AcceleratorRead);
auto fineY_ = fineY.View(); auto fineY_ = fineY.View(AcceleratorRead);
auto coarseA_= coarseA.View(); auto coarseA_= coarseA.View(AcceleratorRead);
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@ -227,11 +227,10 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion? // Precision promotion?
auto CoarseInner_ = CoarseInner.View();
auto coarse_inner_ = coarse_inner.View();
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
auto CoarseInner_ = CoarseInner.View(AcceleratorWrite);
auto coarse_inner_ = coarse_inner.View(AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, { accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss]; CoarseInner_[ss] = coarse_inner_[ss];
}); });
@ -266,8 +265,8 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
// Turn this around to loop threaded over sc and interior loop // Turn this around to loop threaded over sc and interior loop
// over sf would thread better // over sf would thread better
auto coarseData_ = coarseData.View(); auto coarseData_ = coarseData.View(AcceleratorWrite);
auto fineData_ = fineData.View(); auto fineData_ = fineData.View(AcceleratorRead);
accelerator_for(sc,coarse->oSites(),1,{ accelerator_for(sc,coarse->oSites(),1,{
@ -360,8 +359,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
auto fineData_ = fineData.View(); auto fineData_ = fineData.View(AcceleratorWrite);
auto coarseData_ = coarseData.View(); auto coarseData_ = coarseData.View(AcceleratorRead);
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{ accelerator_for(sf,fine->oSites(),1,{
@ -374,7 +373,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View(); /* auto basis_ = Basis[i].View( );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]); if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]); else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
} }
@ -395,8 +394,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i); Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
Lattice<CComplex> cip(coarse); Lattice<CComplex> cip(coarse);
auto cip_ = cip.View(); auto cip_ = cip.View(AcceleratorWrite);
auto ip_ = ip.View(); auto ip_ = ip.View(AcceleratorRead);
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
coalescedWrite(cip_[sc], ip_(sc)()); coalescedWrite(cip_[sc], ip_(sc)());
}); });
@ -470,8 +469,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions; Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride; Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride; Coordinate ost = Tg->_ostride;
auto t_v = To.View(); auto t_v = To.View(AcceleratorWrite);
auto f_v = From.View(); auto f_v = From.View(AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{ accelerator_for(idx,Fg->lSites(),1,{
sobj s; sobj s;
Coordinate Fcoor(nd); Coordinate Fcoor(nd);
@ -718,7 +717,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
} }
//loop over outer index //loop over outer index
auto in_v = in.View(); auto in_v = in.View(CpuRead);
thread_for(in_oidx,in_grid->oSites(),{ thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> out_ptrs(in_nsimd); ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@ -811,7 +810,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
icoor[lane].resize(ndim); icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane); grid->iCoorFromIindex(icoor[lane],lane);
} }
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
thread_for(oidx, grid->oSites(),{ thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> ptrs(nsimd); ExtractPointerArray<sobj> ptrs(nsimd);
@ -914,7 +913,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
std::vector<SobjOut> in_slex_conv(in_grid->lSites()); std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in); unvectorizeToLexOrdArray(in_slex_conv, in);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{ thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim); Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx); out_grid->oCoorFromOindex(out_ocoor, out_oidx);

View File

@ -41,8 +41,8 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
}); });
@ -56,8 +56,8 @@ template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))> inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{ {
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); auto ret_v = ret.View(AcceleratorWrite);
auto lhs_v = lhs.View(); auto lhs_v = lhs.View(AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
}); });

View File

@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); auto rhs = rhs_i.View(AcceleratorRead);
auto ret = ret_i.View(); auto ret = ret_i.View(AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{ accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y); ret[ss]=pow(rhs[ss],y);
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
} }
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); auto rhs = rhs_i.View(AcceleratorRead);
auto ret = ret_i.View(); auto ret = ret_i.View(AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y)); coalescedWrite(ret[ss],mod(rhs(ss),y));
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View(); auto ret = ret_i.View(AcceleratorWrite);
auto rhs = rhs_i.View(); auto rhs = rhs_i.View(AcceleratorRead);
ret.Checkerboard() = rhs_i.Checkerboard(); ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y)); coalescedWrite(ret[ss],div(rhs(ss),y));
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); auto rhs = rhs_i.View(AcceleratorRead);
auto ret = ret_i.View(); auto ret = ret_i.View(AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));

View File

@ -233,10 +233,10 @@ public:
Uconj = where(coor==neglink,-Uconj,Uconj); Uconj = where(coor==neglink,-Uconj,Uconj);
} }
auto U_v = U.View(); auto U_v = U.View(CpuRead);
auto Uds_v = Uds.View(); auto Uds_v = Uds.View(CpuWrite);
auto Uconj_v = Uconj.View(); auto Uconj_v = Uconj.View(CpuRead);
auto Utmp_v= Utmp.View(); auto Utmp_v= Utmp.View(CpuWrite);
thread_foreach(ss,U_v,{ thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss](); Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss](); Uds_v[ss](1)(mu) = Uconj_v[ss]();
@ -272,8 +272,8 @@ public:
GaugeLinkField link(mat.Grid()); GaugeLinkField link(mat.Grid());
// use lorentz for flavour as hack. // use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A)); auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
auto link_v = link.View(); auto link_v = link.View(CpuWrite);
auto tmp_v = tmp.View(); auto tmp_v = tmp.View(CpuRead);
thread_foreach(ss,tmp_v,{ thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
}); });
@ -306,9 +306,9 @@ public:
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
auto tmp_v = tmp.View(); auto tmp_v = tmp.View(CpuWrite);
auto Atilde_v = Atilde.View(); auto Atilde_v = Atilde.View(CpuRead);
auto Btilde_v = Btilde.View(); auto Btilde_v = Btilde.View(CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{ thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss; int sF = s + Ls * ss;

View File

@ -264,8 +264,8 @@ private:
{ {
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); auto T_v = T.View(CpuWrite);
auto F_v = F.View(); auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(), thread_for(i, CloverTerm.Grid()->oSites(),
{ {
T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
@ -282,8 +282,8 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); auto T_v = T.View(CpuWrite);
auto F_v = F.View(); auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(), thread_for(i, CloverTerm.Grid()->oSites(),
{ {
T_v[i]()(0, 1) = -F_v[i]()(); T_v[i]()(0, 1) = -F_v[i]()();
@ -300,8 +300,8 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); auto T_v = T.View(CpuWrite);
auto F_v = F.View(); auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(), thread_for(i, CloverTerm.Grid()->oSites(),
{ {
T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
@ -318,8 +318,8 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); auto T_v = T.View(CpuWrite);
auto F_v = F.View(); auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(), thread_for(i, CloverTerm.Grid()->oSites(),
{ {
T_v[i]()(0, 1) = timesI(F_v[i]()()); T_v[i]()(0, 1) = timesI(F_v[i]()());
@ -336,8 +336,8 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); auto T_v = T.View(CpuWrite);
auto F_v = F.View(); auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(), thread_for(i, CloverTerm.Grid()->oSites(),
{ {
T_v[i]()(0, 1) = -(F_v[i]()()); T_v[i]()(0, 1) = -(F_v[i]()());
@ -355,8 +355,8 @@ private:
T = Zero(); T = Zero();
auto T_v = T.View(); auto T_v = T.View(CpuWrite);
auto F_v = F.View(); auto F_v = F.View(CpuRead);
thread_for(i, CloverTerm.Grid()->oSites(), thread_for(i, CloverTerm.Grid()->oSites(),
{ {
T_v[i]()(0, 0) = timesI(F_v[i]()()); T_v[i]()(0, 0) = timesI(F_v[i]()());

View File

@ -106,9 +106,9 @@ public:
const _SpinorField & phi, const _SpinorField & phi,
int mu) int mu)
{ {
auto out_v= out.View(); auto out_v= out.View(CpuWrite);
auto phi_v= phi.View(); auto phi_v= phi.View(CpuRead);
auto Umu_v= Umu.View(); auto Umu_v= Umu.View(CpuRead);
thread_for(sss,out.Grid()->oSites(),{ thread_for(sss,out.Grid()->oSites(),{
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
}); });
@ -191,9 +191,9 @@ public:
int Ls=Btilde.Grid()->_fdimensions[0]; int Ls=Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
auto tmp_v = tmp.View(); auto tmp_v = tmp.View(CpuWrite);
auto Btilde_v = Btilde.View(); auto Btilde_v = Btilde.View(CpuRead);
auto Atilde_v = Atilde.View(); auto Atilde_v = Atilde.View(CpuRead);
thread_for(sss,tmp.Grid()->oSites(),{ thread_for(sss,tmp.Grid()->oSites(),{
int sU=sss; int sU=sss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){

View File

@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(); auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
{ {
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(); auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
int Ls=this->Ls; int Ls=this->Ls;
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
int Ls=this->Ls; int Ls=this->Ls;
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
auto plee = & lee [0]; auto plee = & lee [0];
auto pdee = & dee [0]; auto pdee = & dee [0];

View File

@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); auto psi = psi_i.View(CpuRead);
auto phi = phi_i.View(); auto phi = phi_i.View(CpuRead);
auto chi = chi_i.View(); auto chi = chi_i.View(CpuWrite);
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
const int nsimd= Simd::Nsimd(); const int nsimd= Simd::Nsimd();
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi=psi_i.View(); auto psi=psi_i.View(CpuRead);
auto phi=phi_i.View(); auto phi=phi_i.View(CpuRead);
auto chi=chi_i.View(); auto chi=chi_i.View(CpuWrite);
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd(); int nsimd= Simd::Nsimd();
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
Vector<iSinglet<Simd> > &Matm) Vector<iSinglet<Simd> > &Matm)
{ {
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
auto psi = psi_i.View(); auto psi = psi_i.View(CpuRead);
auto chi = chi_i.View(); auto chi = chi_i.View(CpuWrite);
#ifndef AVX512 #ifndef AVX512
{ {
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
EnableIf<Impl::LsVectorised,int> sfinae=0; EnableIf<Impl::LsVectorised,int> sfinae=0;
#ifndef AVX512 #ifndef AVX512
{ {
auto psi = psi_i.View(); auto psi = psi_i.View(CpuRead);
auto chi = chi_i.View(); auto chi = chi_i.View(CpuWrite);
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM; SiteHalfSpinor BcastM;
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
} }
#else #else
{ {
auto psi = psi_i.View(); auto psi = psi_i.View(CpuRead);
auto chi = chi_i.View(); auto chi = chi_i.View(CpuWrite);
// pointers // pointers
// MASK_REGS; // MASK_REGS;
#define Chi_00 %zmm0 #define Chi_00 %zmm0

View File

@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls; int Ls = this->Ls;
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto phi = phi_i.View(); auto phi = phi_i.View(AcceleratorRead);
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
auto pupper = &upper[0]; auto pupper = &upper[0];
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(); auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
auto pupper = &upper[0]; auto pupper = &upper[0];
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto psi=psi_i.View(); auto psi=psi_i.View(AcceleratorRead);
auto chi=chi_i.View(); auto chi=chi_i.View(AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0]; auto plee = & this->lee[0];
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0]; auto plee = & this->lee[0];

View File

@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
Compressor compressor; Compressor compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
auto Umu_v = Umu.View(); auto Umu_v = Umu.View(CpuRead);
auto UUUmu_v = UUUmu.View(); auto UUUmu_v = UUUmu.View(CpuRead);
auto in_v = in.View(); auto in_v = in.View(CpuRead);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
thread_for( ss,Umu.Grid()->oSites(),{ thread_for( ss,Umu.Grid()->oSites(),{
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sU=ss; int sU=ss;
@ -339,10 +339,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
} }
// do the compute // do the compute
auto U_v = U.View(); auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(); auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(); auto in_v = in.View(CpuRead);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) { if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) { for (int ss = myblock; ss < myblock+myn; ++ss) {
@ -376,10 +376,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DhopComputeTime2-=usecond(); DhopComputeTime2-=usecond();
auto U_v = U.View(); auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(); auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(); auto in_v = in.View(CpuRead);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) { if (dag == DaggerYes) {
int sz=st.surface_list.size(); int sz=st.surface_list.size();
thread_for( ss,sz,{ thread_for( ss,sz,{
@ -418,10 +418,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
auto U_v = U.View(); auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(); auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(); auto in_v = in.View(CpuRead);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) { if (dag == DaggerYes) {
thread_for( ss,U.Grid()->oSites(),{ thread_for( ss,U.Grid()->oSites(),{
int sU=ss; int sU=ss;

View File

@ -250,10 +250,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
//////////////////////// ////////////////////////
// Call the single hop // Call the single hop
//////////////////////// ////////////////////////
auto U_v = U.View(); auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(); auto UUU_v = UUU.View(CpuRead);
auto B_v = B.View(); auto B_v = B.View(CpuWrite);
auto Btilde_v = Btilde.View(); auto Btilde_v = Btilde.View(CpuWrite);
thread_for(sss,B.Grid()->oSites(),{ thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
}); });
@ -378,10 +378,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
Compressor compressor; Compressor compressor;
Stencil.HaloExchange(in, compressor); Stencil.HaloExchange(in, compressor);
auto Umu_v = Umu.View(); auto Umu_v = Umu.View(CpuRead);
auto UUUmu_v = UUUmu.View(); auto UUUmu_v = UUUmu.View(CpuRead);
auto in_v = in.View(); auto in_v = in.View(CpuRead);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
thread_for( sss, in.Grid()->oSites(),{ thread_for( sss, in.Grid()->oSites(),{
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
}); });
@ -449,10 +449,10 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
} }
// do the compute // do the compute
auto U_v = U.View(); auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(); auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(); auto in_v = in.View(CpuRead);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) { if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) { for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss; int sU = ss;
@ -479,10 +479,10 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopComputeTime2 -= usecond(); DhopComputeTime2 -= usecond();
{ {
auto U_v = U.View(); auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(); auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(); auto in_v = in.View(CpuRead);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
if (dag == DaggerYes) { if (dag == DaggerYes) {
int sz=st.surface_list.size(); int sz=st.surface_list.size();
thread_for(ss,sz,{ thread_for(ss,sz,{
@ -520,10 +520,10 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
st.HaloExchange(in, compressor); st.HaloExchange(in, compressor);
DhopCommTime += usecond(); DhopCommTime += usecond();
auto U_v = U.View(); auto U_v = U.View(CpuRead);
auto UUU_v = UUU.View(); auto UUU_v = UUU.View(CpuRead);
auto in_v = in.View(); auto in_v = in.View(CpuRead);
auto out_v = out.View(); auto out_v = out.View(CpuWrite);
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
thread_for(sss, in.Grid()->oSites(),{ thread_for(sss, in.Grid()->oSites(),{

View File

@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(); auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(); auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
auto pm = this->pm; auto pm = this->pm;
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(); auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto phi = phi_i.View(); auto phi = phi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
auto plee = & this->lee [0]; auto plee = & this->lee [0];
auto pdee = & this->dee [0]; auto pdee = & this->dee [0];
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
auto pm = this->pm; auto pm = this->pm;
auto plee = & this->lee [0]; auto plee = & this->lee [0];
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
auto plee = & this->lee [0]; auto plee = & this->lee [0];
auto pdee = & this->dee [0]; auto pdee = & this->dee [0];
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
auto psi = psi_i.View(); auto psi = psi_i.View(AcceleratorRead);
auto chi = chi_i.View(); auto chi = chi_i.View(AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto pm = this->pm; auto pm = this->pm;

View File

@ -475,12 +475,12 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
// Inefficient comms method but not performance critical. // Inefficient comms method but not performance critical.
tmp1 = Cshift(q_in_1, mu, 1); tmp1 = Cshift(q_in_1, mu, 1);
tmp2 = Cshift(q_in_2, mu, 1); tmp2 = Cshift(q_in_2, mu, 1);
auto tmp1_v = tmp1.View(); auto tmp1_v = tmp1.View(CpuWrite);
auto tmp2_v = tmp2.View(); auto tmp2_v = tmp2.View(CpuWrite);
auto q_in_1_v=q_in_1.View(); auto q_in_1_v=q_in_1.View(CpuRead);
auto q_in_2_v=q_in_2.View(); auto q_in_2_v=q_in_2.View(CpuRead);
auto q_out_v = q_out.View(); auto q_out_v = q_out.View(CpuRead);
auto Umu_v = Umu.View(); auto Umu_v = Umu.View(CpuRead);
thread_for(sU, Umu.Grid()->oSites(),{ thread_for(sU, Umu.Grid()->oSites(),{
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU], Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
q_in_2_v[sU], q_in_2_v[sU],
@ -526,11 +526,11 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = lattice_cmplx*q_in; tmp = lattice_cmplx*q_in;
tmpBwd = Cshift(tmp, mu, -1); tmpBwd = Cshift(tmp, mu, -1);
auto coords_v = coords.View(); auto coords_v = coords.View(CpuRead);
auto tmpFwd_v = tmpFwd.View(); auto tmpFwd_v = tmpFwd.View(CpuRead);
auto tmpBwd_v = tmpBwd.View(); auto tmpBwd_v = tmpBwd.View(CpuRead);
auto Umu_v = Umu.View(); auto Umu_v = Umu.View(CpuRead);
auto q_out_v = q_out.View(); auto q_out_v = q_out.View(CpuWrite);
thread_for(sU, Umu.Grid()->oSites(), { thread_for(sU, Umu.Grid()->oSites(), {

View File

@ -348,18 +348,18 @@ template <class Impl>
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out) int Nsite, const FermionField &in, std::vector<FermionField> &out)
{ {
auto U_v = U.View(); auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(); auto in_v = in.View(AcceleratorRead);
auto st_v = st.View(); auto st_v = st.View(AcceleratorRead);
auto out_Xm = out[0].View(); auto out_Xm = out[0].View(AcceleratorWrite);
auto out_Ym = out[1].View(); auto out_Ym = out[1].View(AcceleratorWrite);
auto out_Zm = out[2].View(); auto out_Zm = out[2].View(AcceleratorWrite);
auto out_Tm = out[3].View(); auto out_Tm = out[3].View(AcceleratorWrite);
auto out_Xp = out[4].View(); auto out_Xp = out[4].View(AcceleratorWrite);
auto out_Yp = out[5].View(); auto out_Yp = out[5].View(AcceleratorWrite);
auto out_Zp = out[6].View(); auto out_Zp = out[6].View(AcceleratorWrite);
auto out_Tp = out[7].View(); auto out_Tp = out[7].View(AcceleratorWrite);
auto CBp=st.CommBuf(); auto CBp=st.CommBuf();
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls; int sU=sss/Ls;
@ -383,10 +383,10 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
assert(dirdisp<=7); assert(dirdisp<=7);
assert(dirdisp>=0); assert(dirdisp>=0);
auto U_v = U.View(); auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(); auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(); auto out_v = out.View(AcceleratorWrite);
auto st_v = st.View(); auto st_v = st.View(AcceleratorRead);
auto CBp=st.CommBuf(); auto CBp=st.CommBuf();
#define LoopBody(Dir) \ #define LoopBody(Dir) \
case Dir : \ case Dir : \
@ -438,10 +438,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior) int interior,int exterior)
{ {
auto U_v = U.View(); auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(); auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(); auto out_v = out.View(AcceleratorWrite);
auto st_v = st.View(); auto st_v = st.View(AcceleratorRead);
if( interior && exterior ) { if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
@ -469,10 +469,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior) int interior,int exterior)
{ {
auto U_v = U.View(); auto U_v = U.View(AcceleratorRead);
auto in_v = in.View(); auto in_v = in.View(AcceleratorRead);
auto out_v = out.View(); auto out_v = out.View(AcceleratorWrite);
auto st_v = st.View(); auto st_v = st.View(AcceleratorRead);
if( interior && exterior ) { if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}

View File

@ -86,8 +86,8 @@ public:
// Move this elsewhere? FIXME // Move this elsewhere? FIXME
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
auto U_v = U.View(); auto U_v = U.View(CpuWrite);
auto W_v = W.View(); auto W_v = W.View(CpuRead);
thread_for( ss, U.Grid()->oSites(), { thread_for( ss, U.Grid()->oSites(), {
U_v[ss](mu) = U_v[ss](mu) + W_v[ss](); U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
}); });
@ -131,8 +131,8 @@ public:
//static std::chrono::duration<double> diff; //static std::chrono::duration<double> diff;
//auto start = std::chrono::high_resolution_clock::now(); //auto start = std::chrono::high_resolution_clock::now();
auto U_v = U.View(); auto U_v = U.View(CpuWrite);
auto P_v = P.View(); auto P_v = P.View(CpuRead);
thread_for(ss, P.Grid()->oSites(),{ thread_for(ss, P.Grid()->oSites(),{
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));

View File

@ -89,8 +89,8 @@ public:
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared; action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
auto p_v = p.View(); auto p_v = p.View(CpuRead);
auto action_v = action.View(); auto action_v = action.View(CpuWrite);
for (int mu = 0; mu < Ndim; mu++) for (int mu = 0; mu < Ndim; mu++)
{ {
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
@ -146,8 +146,8 @@ public:
for (int point = 0; point < npoint; point++) for (int point = 0; point < npoint; point++)
{ {
auto p_v = p.View(); auto p_v = p.View(CpuRead);
auto force_v = force.View(); auto force_v = force.View(CpuWrite);
int permute_type; int permute_type;
StencilEntry *SE; StencilEntry *SE;

View File

@ -49,7 +49,7 @@ public:
private: private:
const unsigned int smearingLevels; const unsigned int smearingLevels;
Smear_Stout<Gimpl> StoutSmearing; Smear_Stout<Gimpl> *StoutSmearing;
std::vector<GaugeField> SmearedSet; std::vector<GaugeField> SmearedSet;
// Member functions // Member functions
@ -72,7 +72,7 @@ private:
previous_u = *ThinLinks; previous_u = *ThinLinks;
for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
{ {
StoutSmearing.smear(SmearedSet[smearLvl], previous_u); StoutSmearing->smear(SmearedSet[smearLvl], previous_u);
previous_u = SmearedSet[smearLvl]; previous_u = SmearedSet[smearLvl];
// For debug purposes // For debug purposes
@ -93,7 +93,7 @@ private:
GaugeLinkField SigmaKPrime_mu(grid); GaugeLinkField SigmaKPrime_mu(grid);
GaugeLinkField GaugeKmu(grid), Cmu(grid); GaugeLinkField GaugeKmu(grid), Cmu(grid);
StoutSmearing.BaseSmear(C, GaugeK); StoutSmearing->BaseSmear(C, GaugeK);
SigmaK = Zero(); SigmaK = Zero();
iLambda = Zero(); iLambda = Zero();
@ -107,7 +107,7 @@ private:
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
pokeLorentz(iLambda, iLambda_mu, mu); pokeLorentz(iLambda, iLambda_mu, mu);
} }
StoutSmearing.derivative(SigmaK, iLambda, StoutSmearing->derivative(SigmaK, iLambda,
GaugeK); // derivative of SmearBase GaugeK); // derivative of SmearBase
return SigmaK; return SigmaK;
} }
@ -144,14 +144,14 @@ private:
// Exponential // Exponential
iQ2 = iQ * iQ; iQ2 = iQ * iQ;
iQ3 = iQ * iQ2; iQ3 = iQ * iQ2;
StoutSmearing.set_uw(u, w, iQ2, iQ3); StoutSmearing->set_uw(u, w, iQ2, iQ3);
StoutSmearing.set_fj(f0, f1, f2, u, w); StoutSmearing->set_fj(f0, f1, f2, u, w);
e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2; e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
// Getting B1, B2, Gamma and Lambda // Getting B1, B2, Gamma and Lambda
// simplify this part, reduntant calculations in set_fj // simplify this part, reduntant calculations in set_fj
xi0 = StoutSmearing.func_xi0(w); xi0 = StoutSmearing->func_xi0(w);
xi1 = StoutSmearing.func_xi1(w); xi1 = StoutSmearing->func_xi1(w);
u2 = u * u; u2 = u * u;
w2 = w * w; w2 = w * w;
cosw = cos(w); cosw = cos(w);
@ -219,7 +219,7 @@ public:
/* Standard constructor */ /* Standard constructor */
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear, SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
Smear_Stout<Gimpl>& Stout) Smear_Stout<Gimpl>& Stout)
: smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) : smearingLevels(Nsmear), StoutSmearing(&Stout), ThinLinks(NULL)
{ {
for (unsigned int i = 0; i < smearingLevels; ++i) for (unsigned int i = 0; i < smearingLevels; ++i)
SmearedSet.push_back(*(new GaugeField(UGrid))); SmearedSet.push_back(*(new GaugeField(UGrid)));
@ -227,7 +227,7 @@ public:
/*! For just thin links */ /*! For just thin links */
SmearedConfiguration() SmearedConfiguration()
: smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} : smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
// attach the smeared routines to the thin links U and fill the smeared set // attach the smeared routines to the thin links U and fill the smeared set
void set_Field(GaugeField &U) void set_Field(GaugeField &U)

View File

@ -185,13 +185,13 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
for(int i=0;i<Lblock;i++){ for(int i=0;i<Lblock;i++){
auto lhs_v = lhs_wi[i].View(); auto lhs_v = lhs_wi[i].View(CpuRead);
auto left = conjugate(lhs_v[ss]); auto left = conjugate(lhs_v[ss]);
for(int j=0;j<Rblock;j++){ for(int j=0;j<Rblock;j++){
SpinMatrix_v vv; SpinMatrix_v vv;
auto rhs_v = rhs_vj[j].View(); auto rhs_v = rhs_vj[j].View(CpuRead);
auto right = rhs_v[ss]; auto right = rhs_v[ss];
for(int s1=0;s1<Ns;s1++){ for(int s1=0;s1<Ns;s1++){
for(int s2=0;s2<Ns;s2++){ for(int s2=0;s2<Ns;s2++){
@ -204,7 +204,7 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
for ( int m=0;m<Nmom;m++){ for ( int m=0;m<Nmom;m++){
int idx = m+base; int idx = m+base;
auto mom_v = mom[m].View(); auto mom_v = mom[m].View(CpuRead);
auto phase = mom_v[ss]; auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase); mac(&lvSum[idx],&vv,&phase);
} }
@ -371,7 +371,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
for(int i=0;i<Lblock;i++){ for(int i=0;i<Lblock;i++){
auto wi_v = wi[i].View(); auto wi_v = wi[i].View(CpuRead);
auto w = conjugate(wi_v[ss]); auto w = conjugate(wi_v[ss]);
if (g5) { if (g5) {
w()(2)(0) = - w()(2)(0); w()(2)(0) = - w()(2)(0);
@ -383,7 +383,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
} }
for(int j=0;j<Rblock;j++){ for(int j=0;j<Rblock;j++){
auto vj_v=vj[j].View(); auto vj_v=vj[j].View(CpuRead);
auto v = vj_v[ss]; auto v = vj_v[ss];
auto vv = v()(0)(0); auto vv = v()(0)(0);
@ -518,12 +518,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
for(int i=0;i<Lblock;i++){ for(int i=0;i<Lblock;i++){
auto wi_v = wi[i].View(); auto wi_v = wi[i].View(CpuRead);
auto w = conjugate(wi_v[ss]); auto w = conjugate(wi_v[ss]);
for(int j=0;j<Rblock;j++){ for(int j=0;j<Rblock;j++){
auto vj_v = vj[j].View(); auto vj_v = vj[j].View(CpuRead);
auto v = vj_v[ss]; auto v = vj_v[ss];
auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
@ -544,7 +544,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
for ( int m=0;m<Nmom;m++){ for ( int m=0;m<Nmom;m++){
int idx = m+base; int idx = m+base;
auto mom_v = mom[m].View(); auto mom_v = mom[m].View(CpuRead);
auto phase = mom_v[ss]; auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase()()()); mac(&lvSum[idx],&vv,&phase()()());
} }
@ -730,13 +730,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
for(int i=0;i<Lblock;i++) for(int i=0;i<Lblock;i++)
{ {
auto wi_v = lhs_wi[i].View(); auto wi_v = lhs_wi[i].View(CpuRead);
auto left = conjugate(wi_v[ss]); auto left = conjugate(wi_v[ss]);
for(int j=0;j<Rblock;j++) for(int j=0;j<Rblock;j++)
{ {
SpinMatrix_v vv; SpinMatrix_v vv;
auto vj_v = rhs_vj[j].View(); auto vj_v = rhs_vj[j].View(CpuRead);
auto right = vj_v[ss]; auto right = vj_v[ss];
for(int s1=0;s1<Ns;s1++) for(int s1=0;s1<Ns;s1++)
@ -752,8 +752,8 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
for ( int m=0;m<Nem;m++) for ( int m=0;m<Nem;m++)
{ {
auto emB0_v = emB0[m].View(); auto emB0_v = emB0[m].View(CpuRead);
auto emB1_v = emB1[m].View(); auto emB1_v = emB1[m].View(CpuRead);
int idx = m+base; int idx = m+base;
auto b0 = emB0_v[ss]; auto b0 = emB0_v[ss];
auto b1 = emB1_v[ss]; auto b1 = emB1_v[ss];
@ -1014,12 +1014,12 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
for(int d_o=0;d_o<N_d;d_o+=d_unroll){ for(int d_o=0;d_o<N_d;d_o+=d_unroll){
for(int t=0;t<N_t;t++){ for(int t=0;t<N_t;t++){
for(int s=0;s<N_s;s++){ for(int s=0;s<N_s;s++){
auto vs_v = vs[s].View(); auto vs_v = vs[s].View(CpuRead);
auto tmp1 = vs_v[ss]; auto tmp1 = vs_v[ss];
vobj tmp2 = Zero(); vobj tmp2 = Zero();
vobj tmp3 = Zero(); vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){ for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
auto vd_v = vd[d].View(); auto vd_v = vd[d].View(CpuRead);
Scalar_v coeff = WW_sd(t,s,d); Scalar_v coeff = WW_sd(t,s,d);
tmp3 = conjugate(vd_v[ss]); tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3); mac(&tmp2, &coeff, &tmp3);
@ -1067,12 +1067,12 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{
for(int d_o=0;d_o<N_d;d_o+=d_unroll){ for(int d_o=0;d_o<N_d;d_o+=d_unroll){
for(int s=0;s<N_s;s++){ for(int s=0;s<N_s;s++){
auto vs_v = vs[s].View(); auto vs_v = vs[s].View(CpuRead);
auto tmp1 = vs_v[ss]; auto tmp1 = vs_v[ss];
vobj tmp2 = Zero(); vobj tmp2 = Zero();
vobj tmp3 = Zero(); vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){ for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
auto vd_v = vd[d].View(); auto vd_v = vd[d].View(CpuRead);
Scalar_v coeff = buf(s,d); Scalar_v coeff = buf(s,d);
tmp3 = conjugate(vd_v[ss]); tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3); mac(&tmp2, &coeff, &tmp3);
@ -1093,7 +1093,7 @@ inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
const vobj &rhs, const vobj &rhs,
const int Ns, const int ss) const int Ns, const int ss)
{ {
auto WWVV_v = WWVV.View(); auto WWVV_v = WWVV.View(CpuWrite);
for (int s1 = 0; s1 < Ns; s1++){ for (int s1 = 0; s1 < Ns; s1++){
for (int s2 = 0; s2 < Ns; s2++){ for (int s2 = 0; s2 < Ns; s2++){
WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0); WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
@ -1122,10 +1122,10 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
GridBase *grid = WWVV0.Grid(); GridBase *grid = WWVV0.Grid();
auto WWVV0_v = WWVV0.View(); auto WWVV0_v = WWVV0.View(CpuRead);
auto WWVV1_v = WWVV1.View(); auto WWVV1_v = WWVV1.View(CpuRead);
auto O_trtr_v= O_trtr.View(); auto O_trtr_v= O_trtr.View(CpuWrite);
auto O_fig8_v= O_fig8.View(); auto O_fig8_v= O_fig8.View(CpuWrite);
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{
typedef typename ComplexField::vector_object vobj; typedef typename ComplexField::vector_object vobj;
@ -1166,10 +1166,10 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
GridBase *grid = WWVV0.Grid(); GridBase *grid = WWVV0.Grid();
auto WWVV0_v = WWVV0.View(); auto WWVV0_v = WWVV0.View(CpuRead);
auto WWVV1_v = WWVV1.View(); auto WWVV1_v = WWVV1.View(CpuRead);
auto O_trtr_v= O_trtr.View(); auto O_trtr_v= O_trtr.View(CpuWrite);
auto O_fig8_v= O_fig8.View(); auto O_fig8_v= O_fig8.View(CpuWrite);
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{

View File

@ -273,10 +273,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
for (int ie=0; ie < 6 ; ie++) for (int ie=0; ie < 6 ; ie++)
wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
auto vbaryon_corr= baryon_corr.View(); auto vbaryon_corr= baryon_corr.View(CpuWrite);
auto v1 = q1_left.View(); auto v1 = q1_left.View(CpuRead);
auto v2 = q2_left.View(); auto v2 = q2_left.View(CpuRead);
auto v3 = q3_left.View(); auto v3 = q3_left.View(CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), { // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{
@ -560,10 +560,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
{ {
GridBase *grid = qs_ti.Grid(); GridBase *grid = qs_ti.Grid();
auto vcorr= stn_corr.View(); auto vcorr= stn_corr.View(CpuWrite);
auto vq_loop = qq_loop.View(); auto vq_loop = qq_loop.View(CpuRead);
auto vd_tf = qd_tf.View(); auto vd_tf = qd_tf.View(CpuRead);
auto vs_ti = qs_ti.View(); auto vs_ti = qs_ti.View(CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), { // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{
@ -597,11 +597,11 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
{ {
GridBase *grid = qs_ti.Grid(); GridBase *grid = qs_ti.Grid();
auto vcorr= stn_corr.View(); auto vcorr= stn_corr.View(CpuWrite);
auto vq_ti = qq_ti.View(); auto vq_ti = qq_ti.View(CpuRead);
auto vq_tf = qq_tf.View(); auto vq_tf = qq_tf.View(CpuRead);
auto vd_tf = qd_tf.View(); auto vd_tf = qd_tf.View(CpuRead);
auto vs_ti = qs_ti.View(); auto vs_ti = qs_ti.View(CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), { // accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{

View File

@ -47,8 +47,8 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
accelerator_for( ss, x_v.size(),vobj::Nsimd(), { accelerator_for( ss, x_v.size(),vobj::Nsimd(), {
auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss))); auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss)));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
@ -63,9 +63,9 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
conformable(x,z); conformable(x,z);
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(); auto y_v = y.View(AcceleratorRead);
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
// FIXME -- need a new class of accelerator_loop to implement this // FIXME -- need a new class of accelerator_loop to implement this
// //
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(); auto y_v = y.View(AcceleratorRead);
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
conformable(x,z); conformable(x,z);
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(); auto y_v = y.View(AcceleratorRead);
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(); auto y_v = y.View(AcceleratorRead);
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(); auto y_v = y.View(AcceleratorRead);
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
conformable(x,z); conformable(x,z);
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto y_v = y.View(); auto y_v = y.View(AcceleratorRead);
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
@ -189,8 +189,8 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
conformable(x,z); conformable(x,z);
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
@ -222,8 +222,8 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
static_assert(nbasis % 2 == 0, ""); static_assert(nbasis % 2 == 0, "");
int nb = nbasis / 2; int nb = nbasis / 2;
auto z_v = z.View(); auto z_v = z.View(AcceleratorWrite);
auto x_v = x.View(); auto x_v = x.View(AcceleratorRead);
accelerator_for(ss,grid->oSites(),CComplex::Nsimd(), accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
{ {
for(int n = 0; n < nb; ++n) { for(int n = 0; n < nb; ++n) {

View File

@ -222,9 +222,9 @@ public:
conformable(subgroup, Determinant); conformable(subgroup, Determinant);
int i0, i1; int i0, i1;
su2SubGroupIndex(i0, i1, su2_index); su2SubGroupIndex(i0, i1, su2_index);
auto subgroup_v = subgroup.View(); auto subgroup_v = subgroup.View(CpuWrite);
auto source_v = source.View(); auto source_v = source.View(CpuRead);
auto Determinant_v = Determinant.View(); auto Determinant_v = Determinant.View(CpuWrite);
thread_for(ss, grid->oSites(), { thread_for(ss, grid->oSites(), {
@ -257,8 +257,8 @@ public:
su2SubGroupIndex(i0, i1, su2_index); su2SubGroupIndex(i0, i1, su2_index);
dest = 1.0; // start out with identity dest = 1.0; // start out with identity
auto dest_v = dest.View(); auto dest_v = dest.View(CpuWrite);
auto subgroup_v = subgroup.View(); auto subgroup_v = subgroup.View(CpuRead);
thread_for(ss, grid->oSites(), thread_for(ss, grid->oSites(),
{ {
dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0); dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);

View File

@ -67,7 +67,7 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
{ {
int num=table.size(); int num=table.size();
std::pair<int,int> *table_v = & table[0]; std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
accelerator_forNB( i,num, vobj::Nsimd(), { accelerator_forNB( i,num, vobj::Nsimd(), {
typedef decltype(coalescedRead(buffer[0])) compressed_t; typedef decltype(coalescedRead(buffer[0])) compressed_t;
compressed_t tmp_c; compressed_t tmp_c;
@ -94,7 +94,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
int num=table.size()/2; int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0]; auto p0=&pointers[0][0];
auto p1=&pointers[1][0]; auto p1=&pointers[1][0];
auto tp=&table[0]; auto tp=&table[0];
@ -122,7 +122,7 @@ struct StencilEntry {
// Could pack to 8 + 4 + 4 = 128 bit and use // Could pack to 8 + 4 + 4 = 128 bit and use
template<class vobj,class cobj,class Parameters> template<class vobj,class cobj,class Parameters>
class CartesianStencilView { class CartesianStencilAccelerator {
public: public:
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector; typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
@ -130,14 +130,15 @@ class CartesianStencilView {
//////////////////////////////////////// ////////////////////////////////////////
// Basic Grid and stencil info // Basic Grid and stencil info
//////////////////////////////////////// ////////////////////////////////////////
int _checkerboard; int _checkerboard;
int _npoints; // Move to template param? int _npoints; // Move to template param?
int _osites;
StencilVector _directions; StencilVector _directions;
StencilVector _distances; StencilVector _distances;
StencilVector _comm_buf_size; StencilVector _comm_buf_size;
StencilVector _permute_type; StencilVector _permute_type;
StencilVector same_node; StencilVector same_node;
Coordinate _simd_layout; Coordinate _simd_layout;
Parameters parameters; Parameters parameters;
StencilEntry* _entries_p; StencilEntry* _entries_p;
cobj* u_recv_buf_p; cobj* u_recv_buf_p;
@ -175,13 +176,37 @@ class CartesianStencilView {
{ {
Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout);
} }
};
template<class vobj,class cobj,class Parameters>
class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters>
{
std::shared_ptr<MemViewDeleter> Deleter;
public:
//
CartesianStencilView (const CartesianStencilView &refer_to_me)
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me), Deleter(refer_to_me.Deleter)
{ }
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode mode)
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me), Deleter(new MemViewDeleter)
{
Deleter->cpu_ptr =(void *)this->_entries_p;
Deleter->mode = mode;
this->_entries_p =(StencilEntry *)
AllocationCache::ViewOpen(this->_entries_p,
this->_npoints*this->_osites*sizeof(StencilEntry),
mode,
AdviseDefault);
}
}; };
//////////////////////////////////////// ////////////////////////////////////////
// The Stencil Class itself // The Stencil Class itself
//////////////////////////////////////// ////////////////////////////////////////
template<class vobj,class cobj,class Parameters> template<class vobj,class cobj,class Parameters>
class CartesianStencil : public CartesianStencilView<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in. class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in.
public: public:
typedef typename cobj::vector_type vector_type; typedef typename cobj::vector_type vector_type;
@ -226,8 +251,8 @@ public:
// Generalise as required later if needed // Generalise as required later if needed
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
View_type View(void) const { View_type View(ViewMode mode) const {
View_type accessor(*( (View_type *) this)); View_type accessor(*( (View_type *) this),mode);
return accessor; return accessor;
} }
@ -662,9 +687,9 @@ public:
_unified_buffer_size=0; _unified_buffer_size=0;
surface_list.resize(0); surface_list.resize(0);
int osites = _grid->oSites(); this->_osites = _grid->oSites();
_entries.resize(this->_npoints* osites); _entries.resize(this->_npoints* this->_osites);
this->_entries_p = &_entries[0]; this->_entries_p = &_entries[0];
for(int ii=0;ii<npoints;ii++){ for(int ii=0;ii<npoints;ii++){

View File

@ -96,7 +96,13 @@ void acceleratorInit(void);
#define accelerator __host__ __device__ #define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline #define accelerator_inline __host__ __device__ inline
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return threadIdx.z; } // CUDA specific accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return threadIdx.z;
#else
return 0;
#endif
} // CUDA specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \ { \
@ -178,7 +184,13 @@ extern cl::sycl::queue *theGridAccelerator;
#define accelerator #define accelerator
#define accelerator_inline strong_inline #define accelerator_inline strong_inline
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; } // SYCL specific accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
#else
return 0;
#endif
} // SYCL specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
@ -224,7 +236,13 @@ NAMESPACE_BEGIN(Grid);
#define accelerator_inline __host__ __device__ inline #define accelerator_inline __host__ __device__ inline
/*These routines define mapping from thread grid to loop & vector lane indexing */ /*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_z; } // HIP specific accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return hipThreadIdx_z;
#else
return 0;
#endif
} // HIP specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \ { \