From 7860a50f70a6084dd281ca7e69aca917c47ddfa3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 May 2020 16:13:16 -0400 Subject: [PATCH] Make view specify where and drive data motion - first cut. This is a compile tiime option --enable-unified=yes/no --- Grid/GridStd.h | 1 + Grid/algorithms/CoarsenedMatrix.h | 42 ++-- Grid/algorithms/FFT.h | 3 +- Grid/algorithms/iterative/BiCGSTAB.h | 14 +- Grid/algorithms/iterative/ConjugateGradient.h | 6 +- .../iterative/ImplicitlyRestartedLanczos.h | 14 +- Grid/allocator/AlignedAllocator.cc | 2 +- Grid/allocator/AllocationCache.cc | 21 +- Grid/allocator/AllocationCache.h | 43 +++- Grid/allocator/MemoryCacheDeviceMem.cc | 157 +++++++++---- Grid/allocator/MemoryCacheShared.cc | 27 +-- Grid/cshift/Cshift_common.h | 18 +- Grid/lattice/Lattice.h | 1 + Grid/lattice/Lattice_ET.h | 8 +- Grid/lattice/Lattice_arith.h | 68 +++--- Grid/lattice/Lattice_base.h | 210 ++---------------- Grid/lattice/Lattice_comparison.h | 14 +- Grid/lattice/Lattice_coordinate.h | 20 +- Grid/lattice/Lattice_local.h | 16 +- Grid/lattice/Lattice_matrix_reduction.h | 14 +- Grid/lattice/Lattice_peekpoke.h | 24 +- Grid/lattice/Lattice_reality.h | 8 +- Grid/lattice/Lattice_reduction.h | 43 ++-- Grid/lattice/Lattice_rng.h | 2 +- Grid/lattice/Lattice_trace.h | 8 +- Grid/lattice/Lattice_transfer.h | 59 +++-- Grid/lattice/Lattice_transpose.h | 8 +- Grid/lattice/Lattice_unary.h | 16 +- Grid/qcd/action/fermion/GparityWilsonImpl.h | 18 +- Grid/qcd/action/fermion/WilsonCloverFermion.h | 24 +- Grid/qcd/action/fermion/WilsonImpl.h | 12 +- .../implementation/CayleyFermion5Dcache.h | 20 +- .../implementation/CayleyFermion5Dvec.h | 24 +- .../DomainWallEOFAFermionCache.h | 20 +- ...ImprovedStaggeredFermion5DImplementation.h | 32 +-- .../ImprovedStaggeredFermionImplementation.h | 40 ++-- .../implementation/MobiusEOFAFermionCache.h | 40 ++-- .../WilsonFermionImplementation.h | 22 +- .../WilsonKernelsImplementation.h | 46 ++-- Grid/qcd/action/gauge/GaugeImplTypes.h | 8 +- .../action/scalar/ScalarInteractionAction.h | 8 +- Grid/qcd/smearing/GaugeConfiguration.h | 20 +- Grid/qcd/utils/A2Autils.h | 50 ++--- Grid/qcd/utils/BaryonUtils.h | 26 +-- Grid/qcd/utils/LinalgUtils.h | 48 ++-- Grid/qcd/utils/SUn.h | 10 +- Grid/stencil/Stencil.h | 47 +++- Grid/threads/Accelerator.h | 24 +- 48 files changed, 688 insertions(+), 718 deletions(-) diff --git a/Grid/GridStd.h b/Grid/GridStd.h index 16cfcf50..ecb561ea 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -6,6 +6,7 @@ /////////////////// #include #include +#include #include #include #include diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 8e5c91a7..4493d740 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -186,10 +186,10 @@ public: hermop.HermOp(*Tn,y); - auto y_v = y.View(); - auto Tn_v = Tn->View(); - auto Tnp_v = Tnp->View(); - auto Tnm_v = Tnm->View(); + auto y_v = y.View(AcceleratorWrite); + auto Tn_v = Tn->View(AcceleratorWrite); + auto Tnp_v = Tnp->View(AcceleratorWrite); + auto Tnm_v = Tnm->View(AcceleratorWrite); const int Nsimd = CComplex::Nsimd(); accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); @@ -264,12 +264,12 @@ public: Stencil.HaloExchange(in,compressor); comms_usec += usecond(); - auto in_v = in.View(); - auto out_v = out.View(); + auto in_v = in.View(AcceleratorRead); + auto out_v = out.View(AcceleratorWrite); typedef LatticeView Aview; Vector AcceleratorViewContainer; - for(int p=0;p Aview; Vector AcceleratorViewContainer; - for(int p=0;poSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); // if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });} @@ -563,11 +563,11 @@ public: mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); { - auto tmp_ = tmp.View(); - auto evenmask_ = evenmask.View(); - auto oddmask_ = oddmask.View(); - auto Mphie_ = Mphie.View(); - auto Mphio_ = Mphio.View(); + auto tmp_ = tmp.View(AcceleratorWrite); + auto evenmask_ = evenmask.View(AcceleratorRead); + auto oddmask_ = oddmask.View(AcceleratorRead); + auto Mphie_ = Mphie.View(AcceleratorRead); + auto Mphio_ = Mphio.View(AcceleratorRead); accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); }); @@ -575,8 +575,8 @@ public: blockProject(SelfProj,tmp,Subspace.subspace); - auto SelfProj_ = SelfProj.View(); - auto A_self = A[self_stencil].View(); + auto SelfProj_ = SelfProj.View(AcceleratorRead); + auto A_self = A[self_stencil].View(AcceleratorWrite); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ for(int j=0;j pgbuf(&pencil_g); - auto pgbuf_v = pgbuf.View(); + auto pgbuf_v = pgbuf.View(CpuWrite); typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; diff --git a/Grid/algorithms/iterative/BiCGSTAB.h b/Grid/algorithms/iterative/BiCGSTAB.h index 3a7be1ef..04328a77 100644 --- a/Grid/algorithms/iterative/BiCGSTAB.h +++ b/Grid/algorithms/iterative/BiCGSTAB.h @@ -122,9 +122,9 @@ class BiCGSTAB : public OperatorFunction LinearCombTimer.Start(); bo = beta * omega; - auto p_v = p.View(); - auto r_v = r.View(); - auto v_v = v.View(); + auto p_v = p.View(AcceleratorWrite); + auto r_v = r.View(AcceleratorWrite); + auto v_v = v.View(AcceleratorWrite); accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); }); @@ -142,13 +142,13 @@ class BiCGSTAB : public OperatorFunction alpha = rho / Calpha.real(); LinearCombTimer.Start(); - auto h_v = h.View(); - auto psi_v = psi.View(); + auto h_v = h.View(AcceleratorWrite); + auto psi_v = psi.View(AcceleratorWrite); accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); }); - auto s_v = s.View(); + auto s_v = s.View(AcceleratorWrite); accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); }); @@ -166,7 +166,7 @@ class BiCGSTAB : public OperatorFunction omega = Comega.real() / norm2(t); LinearCombTimer.Start(); - auto t_v = t.View(); + auto t_v = t.View(AcceleratorWrite); accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index 3a2544b5..d40fee7b 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -140,9 +140,9 @@ public: b = cp / c; LinearCombTimer.Start(); - auto psi_v = psi.View(); - auto p_v = p.View(); - auto r_v = r.View(); + auto psi_v = psi.View(AcceleratorWrite); + auto p_v = p.View(AcceleratorWrite); + auto r_v = r.View(AcceleratorWrite); accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index 49190663..05ed8586 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -57,17 +57,17 @@ void basisOrthogonalize(std::vector &basis,Field &w,int k) template void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) { - typedef decltype(basis[0].View()) View; - auto tmp_v = basis[0].View(); + typedef decltype(basis[0].View(CpuWrite)) View; + auto tmp_v = basis[0].View(CpuWrite); Vector basis_v(basis.size(),tmp_v); View *basis_vp = &basis_v[0]; typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); for(int k=0;k > Bt(thread_max() * Nm); // Thread private thread_region { @@ -149,16 +149,16 @@ void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, i template void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) { - typedef decltype(basis[0].View()) View; + typedef decltype(basis[0].View(AcceleratorWrite)) View; typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); result.Checkerboard() = basis[0].Checkerboard(); - auto result_v=result.View(); + auto result_v=result.View(AcceleratorWrite); Vector basis_v(basis.size(),result_v); View * basis_vp = &basis_v[0]; for(int k=0;k Qt_jv(Nm); double * Qt_j = & Qt_jv[0]; diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index 18854c95..399f1939 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -12,7 +12,7 @@ bool MemoryProfiler::debug = false; #define SMALL_LIMIT (4096) #endif -#ifdef POINTER_CACHE +#ifdef ALLOCATION_CACHE int PointerCache::victim; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; diff --git a/Grid/allocator/AllocationCache.cc b/Grid/allocator/AllocationCache.cc index a7aeea80..dc32affd 100644 --- a/Grid/allocator/AllocationCache.cc +++ b/Grid/allocator/AllocationCache.cc @@ -22,8 +22,10 @@ void *AllocationCache::AcceleratorAllocate(size_t bytes) { void *ptr = (void *) Lookup(bytes,Acc); - if ( ptr == (void *) NULL ) + if ( ptr == (void *) NULL ) { ptr = (void *) acceleratorAllocDevice(bytes); + // std::cout <<"AcceleratorAllocate: allocated Accelerator pointer "<=0){ Evict(e); } + if(e>=0){ Discard(e); } // If present remove entry and free accelerator too. // Can we ever hit a free event with a view still in scope? @@ -90,13 +90,18 @@ void AllocationCache::Init(void) Ncache[AccSmall]=Nc; } } + std::cout << "MemoryManager::Init() SMALL "< -#ifndef GRID_UNIFIED +#ifndef GRID_UVM #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); -#define dprintf(...) +#define dprintf //////////////////////////////////////////////////////////// // For caching copies of data on device @@ -20,15 +20,12 @@ typedef struct { uint32_t cpuLock; } AcceleratorViewEntry; -#define Write (1) -#define Read (2) -#define WriteDiscard (3) ////////////////////////////////////////////////////////////////////// // Data tables for ViewCache ////////////////////////////////////////////////////////////////////// static AcceleratorViewEntry AccCache[NaccCacheMax]; static int AccCacheVictim; // Base for round robin search -static int NaccCache = 8; +static int NaccCache = 32; //////////////////////////////////// // Priority ordering for unlocked entries @@ -68,7 +65,7 @@ int AllocationCache::ViewVictim(void) if ( locks==0 ) { - if( s==Empty ) { prioEmpty = e; dprintf("Empty");} + if( s==Empty ) { prioEmpty = e; dprintf("Empty"); } if( t == EvictNext ) { if( s==CpuDirty ) { prioCpuDirtyEN = e; dprintf("CpuDirty Transient");} @@ -97,21 +94,42 @@ int AllocationCache::ViewVictim(void) if ( prioEmpty >= 0 ) victim = prioEmpty; /*Highest prio is winner*/ assert(victim >= 0); // Must succeed/ - dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim); + dprintf("AllocationCacheDeviceMem: Selected victim cache entry %d\n",victim); // advance victim pointer AccCacheVictim=(AccCacheVictim+1)%NaccCache; - dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache); + dprintf("AllocationCacheDeviceMem: victim pointer now %d / %d\n",AccCacheVictim,NaccCache); return victim; } ///////////////////////////////////////////////// // Accelerator cache motion ///////////////////////////////////////////////// + +void AllocationCache::Discard(int e) // remove from Accelerator, remove entry, without flush +{ + if(AccCache[e].state!=Empty){ + dprintf("AllocationCache: Discard(%d) %llx,%llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); + assert(AccCache[e].accLock==0); + assert(AccCache[e].cpuLock==0); + assert(AccCache[e].CpuPtr!=NULL); + if(AccCache[e].AccPtr) { + dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr); + AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes); + } + } + AccCache[e].AccPtr=NULL; + AccCache[e].CpuPtr=NULL; + AccCache[e].bytes=0; + AccCache[e].state=Empty; + AccCache[e].accLock=0; + AccCache[e].cpuLock=0; +} + void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerator, remove entry { if(AccCache[e].state!=Empty){ - dprintf("AllocationCache: Evict(%d) %llx,%llxn",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); + dprintf("AllocationCache: Evict(%d) %llx,%llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); assert(AccCache[e].accLock==0); assert(AccCache[e].cpuLock==0); if(AccCache[e].state==AccDirty) { @@ -119,7 +137,7 @@ void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerat } assert(AccCache[e].CpuPtr!=NULL); if(AccCache[e].AccPtr) { - dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr); + dprintf("AllocationCache: Free(%d) %llx\n",e,(uint64_t)AccCache[e].AccPtr); AcceleratorFree(AccCache[e].AccPtr,AccCache[e].bytes); } } @@ -132,7 +150,7 @@ void AllocationCache::Evict(int e) // Make CPU consistent, remove from Accelerat } void AllocationCache::Flush(int e)// Copy back from a dirty device state and mark consistent. Do not remove { - dprintf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); + // printf("AllocationCache: Flush(%d) %llx -> %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout); assert(AccCache[e].state==AccDirty); assert(AccCache[e].cpuLock==0); assert(AccCache[e].accLock==0); @@ -150,14 +168,50 @@ void AllocationCache::Clone(int e)// Copy from CPU, mark consistent. Allocate if if(AccCache[e].AccPtr==NULL){ AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes); } - dprintf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); + // printf("AllocationCache: Clone(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout); acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes); AccCache[e].state=Consistent; } + +void AllocationCache::CpuDiscard(int e)// Mark accelerator dirty without copy. Allocate if necessary +{ + assert(AccCache[e].state!=Empty); + assert(AccCache[e].cpuLock==0); + assert(AccCache[e].accLock==0); + assert(AccCache[e].CpuPtr!=NULL); + if(AccCache[e].AccPtr==NULL){ + AccCache[e].AccPtr=AcceleratorAllocate(AccCache[e].bytes); + } + // printf("AllocationCache: CpuDiscard(%d) %llx <- %llx\n",e,(uint64_t)AccCache[e].AccPtr,(uint64_t)AccCache[e].CpuPtr); fflush(stdout); + // acceleratorCopyToDevice(AccCache[e].CpuPtr,AccCache[e].AccPtr,AccCache[e].bytes); + AccCache[e].state=AccDirty; +} + ///////////////////////////////////////////////////////////////////////////////// // View management ///////////////////////////////////////////////////////////////////////////////// -void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) +void AllocationCache::ViewClose(void* Ptr,ViewMode mode) +{ + if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + AcceleratorViewClose(Ptr); + } else if( (mode==CpuRead)||(mode==CpuWrite)){ + CpuViewClose(Ptr); + } else { + assert(0); + } +} +void *AllocationCache::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) +{ + if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ + return AcceleratorViewOpen(CpuPtr,bytes,mode,hint); + } else if( (mode==CpuRead)||(mode==CpuWrite)){ + return CpuViewOpen(CpuPtr,bytes,mode,hint); + } else { + assert(0); + return nullptr; + } +} +void *AllocationCache::AcceleratorViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) { //////////////////////////////////////////////////////////////////////////// // Find if present, otherwise get or force an empty @@ -165,9 +219,11 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi int e=CpuViewLookup(CpuPtr); if(e==-1) { e = ViewVictim(); + dprintf("AcceleratorViewOpen Victim is %d\n",e); Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty } + assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert(AccCache[e].cpuLock==0); // Programming error if(AccCache[e].state!=Empty) { @@ -193,35 +249,50 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi AccCache[e].AccPtr = NULL; AccCache[e].bytes = bytes; AccCache[e].state = CpuDirty; // Cpu starts primary - Clone(e); - if(mode==Write) - AccCache[e].state = AccDirty; // Empty + AccWrite=> AccDirty - else + if(mode==AcceleratorWriteDiscard){ + CpuDiscard(e); + AccCache[e].state = AccDirty; // Empty + AcceleratorWrite=> AccDirty + } else if(mode==AcceleratorWrite){ + Clone(e); + AccCache[e].state = AccDirty; // Empty + AcceleratorWrite=> AccDirty + } else { + Clone(e); AccCache[e].state = Consistent; // Empty + AccRead => Consistent + } AccCache[e].accLock= 1; - } else if(AccCache[e].state&CpuDirty ){ - Clone(e); - if(mode==Write) - AccCache[e].state = AccDirty; // CpuDirty + AccWrite=> AccDirty - else + // printf("Copied Empy entry %d into device accLock %d\n",e,AccCache[e].accLock); + } else if(AccCache[e].state==CpuDirty ){ + if(mode==AcceleratorWriteDiscard) { + CpuDiscard(e); + AccCache[e].state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty + } else if(mode==AcceleratorWrite) { + Clone(e); + AccCache[e].state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty + } else { + Clone(e); AccCache[e].state = Consistent; // CpuDirty + AccRead => Consistent + } AccCache[e].accLock++; - } else if(AccCache[e].state&Consistent) { - if(mode==Write) - AccCache[e].state = AccDirty; // Consistent + AccWrite=> AccDirty + // printf("Copied CpuDirty entry %d into device accLock %d\n",e,AccCache[e].accLock); + } else if(AccCache[e].state==Consistent) { + if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) + AccCache[e].state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache[e].state = Consistent; // Consistent + AccRead => Consistent AccCache[e].accLock++; - } else if(AccCache[e].state&AccDirty) { - if(mode==Write) - AccCache[e].state = AccDirty; // AccDirty + AccWrite=> AccDirty + // printf("Consistent entry %d into device accLock %d\n",e,AccCache[e].accLock); + } else if(AccCache[e].state==AccDirty) { + if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) + AccCache[e].state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache[e].state = AccDirty; // AccDirty + AccRead => AccDirty AccCache[e].accLock++; + // printf("AccDirty entry %d into device accLock %d\n",e,AccCache[e].accLock); } else { assert(0); } + int transient =hint; AccCache[e].transient= transient? EvictNext : 0; return AccCache[e].AccPtr; @@ -241,12 +312,18 @@ void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transi //////////////////////////////////// // look up & decrement lock count //////////////////////////////////// -void AllocationCache::AccViewClose(void* AccPtr) +void AllocationCache::AcceleratorViewClose(void* AccPtr) { - int e=AccViewLookup(AccPtr); + int e=CpuViewLookup(AccPtr); + // printf("AccView close %d lock %d \n",e,AccCache[e].accLock); + if(e==-1) exit(0); + if(AccCache[e].cpuLock!=0) exit(0); + if(AccCache[e].accLock==0) exit(0); + /* assert(e!=-1); assert(AccCache[e].cpuLock==0); assert(AccCache[e].accLock>0); + */ AccCache[e].accLock--; } void AllocationCache::CpuViewClose(void* CpuPtr) @@ -257,7 +334,7 @@ void AllocationCache::CpuViewClose(void* CpuPtr) assert(AccCache[e].accLock==0); AccCache[e].cpuLock--; } -void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) +void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient) { //////////////////////////////////////////////////////////////////////////// // Find if present, otherwise get or force an empty @@ -265,9 +342,11 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi int e=CpuViewLookup(CpuPtr); if(e==-1) { e = ViewVictim(); + dprintf("CpuViewOpen Victim is %d\n",e); Evict(e); // Does copy back if necessary, frees accelerator pointer if not null, sets to empty } + assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache[e].accLock==0); // Programming error if(AccCache[e].state!=Empty) { @@ -288,7 +367,7 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi AccCache[e].cpuLock++; } else if(AccCache[e].state==Consistent) { assert(AccCache[e].AccPtr != NULL); - if(mode==Write) + if(mode==CpuWrite) AccCache[e].state = CpuDirty; // Consistent +CpuWrite => CpuDirty else AccCache[e].state = Consistent; // Consistent +CpuRead => Consistent @@ -296,7 +375,7 @@ void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transi } else if(AccCache[e].state==AccDirty) { assert(AccCache[e].AccPtr != NULL); Flush(e); - if(mode==Write) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush + if(mode==CpuWrite) AccCache[e].state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush else AccCache[e].state = Consistent; // AccDirty +CpuRead => Consistent, Flush AccCache[e].cpuLock++; } else { @@ -321,16 +400,6 @@ int AllocationCache::CpuViewLookup(void *CpuPtr) } return -1; } -int AllocationCache::AccViewLookup(void *AccPtr) -{ - assert(AccPtr!=NULL); - for(int e=0;e -#ifdef GRID_UNIFIED +#ifdef GRID_UVM #warning "Grid is assuming unified virtual memory address space" NAMESPACE_BEGIN(Grid); @@ -7,21 +7,22 @@ NAMESPACE_BEGIN(Grid); // View management is 1:1 address space mapping ///////////////////////////////////////////////////////////////////////////////// -void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; } -void *AllocationCache::AccViewOpen(void* CpuPtr,size_t bytes,int mode,int transient) { return CpuPtr; } -void AllocationCache::AccViewClose(void* AccPtr){} -void AllocationCache::CpuViewClose(void* CpuPtr){} - +void AllocationCache::AcceleratorViewClose(void* AccPtr){}; +void *AllocationCache::AcceleratorViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; } +void AllocationCache::CpuViewClose(void* Ptr){}; +void *AllocationCache::CpuViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; } +int AllocationCache::CpuViewLookup(void *CpuPtr){ return 0;} ///////////////////////////////////// // Dummy stubs ///////////////////////////////////// -int AllocationCache::ViewVictim(void) { assert(0); return 0;} -void AllocationCache::Evict(int e) { assert(0);} -void AllocationCache::Flush(int e) { assert(0);} -void AllocationCache::Clone(int e) { assert(0);} - -int AllocationCache::CpuViewLookup(void *CpuPtr){assert(0); return 0;} -int AllocationCache::AccViewLookup(void *AccPtr){assert(0); return 0;} +void AllocationCache::CpuDiscard(int e) { return;} +void AllocationCache::Discard(int e) { return;} +void AllocationCache::Evict(int e) { return; } +void AllocationCache::Flush(int e) { assert(0);} +void AllocationCache::Clone(int e) { assert(0);} +int AllocationCache::ViewVictim(void) { assert(0); return 0;} +void AllocationCache::ViewClose(void* AccPtr,ViewMode mode){}; +void *AllocationCache::ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){return CpuPtr;}; NAMESPACE_END(Grid); #endif diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index fe9afc62..1c99e797 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -52,7 +52,6 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen int stride=rhs.Grid()->_slice_stride[dimension]; - auto rhs_v = rhs.View(); if ( cbmask == 0x3 ) { for(int n=0;n &rhs,commVector &buffer,int dimen } } } + auto rhs_v = rhs.View(AcceleratorRead); auto buffer_p = & buffer[0]; auto table = &Cshift_table[0]; accelerator_for(i,ent,1,{ @@ -100,7 +100,7 @@ Gather_plane_extract(const Lattice &rhs, int e2=rhs.Grid()->_slice_block[dimension]; int n1=rhs.Grid()->_slice_stride[dimension]; - auto rhs_v = rhs.View(); + auto rhs_v = rhs.View(AcceleratorRead); if ( cbmask ==0x3){ accelerator_for2d(n,e1,b,e2,1,{ int o = n*n1; @@ -179,7 +179,7 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - auto rhs_v = rhs.View(); + auto rhs_v = rhs.View(AcceleratorWrite); accelerator_for2d(n,e1,b,e2,1,{ int o = n*rhs.Grid()->_slice_stride[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension]; @@ -216,7 +216,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA // Test_cshift_red_black code. // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<_slice_stride[dimension]; @@ -272,8 +272,8 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs } } - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(AcceleratorRead); + auto lhs_v = lhs.View(AcceleratorWrite); auto table = &Cshift_table[0]; accelerator_for(i,ent,1,{ lhs_v[table[i].first]=rhs_v[table[i].second]; @@ -315,8 +315,8 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice *************************************************************************************/ /* END LEGAL */ #pragma once +#include #include #include #include diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index b8abd199..b4f196b6 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -91,12 +91,16 @@ const lobj & eval(const uint64_t ss, const LatticeExprView &arg) { return arg[ss]; } + +// What needs this? +#if 1 template accelerator_inline const lobj & eval(const uint64_t ss, const Lattice &arg) { auto view = arg.View(); return view[ss]; } +#endif /////////////////////////////////////////////////// // handle nodes in syntax tree- eval one operand @@ -206,7 +210,7 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression::value, T1>::type * = nullptr> inline void ExpressionViewOpen(T1 &lat) // Lattice leaf { - lat.AcceleratorViewOpen(); + lat.ViewOpen(AcceleratorRead); } template ::value, T1>::type * = nullptr> inline void ExpressionViewOpen(T1 ¬lat) {} @@ -237,7 +241,7 @@ inline void ExpressionViewOpen(LatticeTrinaryExpression &expr) template ::value, T1>::type * = nullptr> inline void ExpressionViewClose( T1 &lat) // Lattice leaf { - lat.AcceleratorViewClose(); + lat.ViewClose(); } template ::value, T1>::type * = nullptr> inline void ExpressionViewClose(T1 ¬lat) {} diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index 3543d6aa..b1252952 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -36,9 +36,9 @@ NAMESPACE_BEGIN(Grid); template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); + auto rhs_v = rhs.View(AcceleratorRead); conformable(ret,rhs); conformable(lhs,rhs); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ @@ -55,9 +55,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); + auto rhs_v = rhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -72,9 +72,9 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); + auto rhs_v = rhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -88,9 +88,9 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); + auto rhs_v = rhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -107,8 +107,8 @@ template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; mult(&tmp,&lhs_v(ss),&rhs); @@ -120,8 +120,8 @@ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -134,8 +134,8 @@ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -147,8 +147,8 @@ template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -164,8 +164,8 @@ template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto rhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -178,8 +178,8 @@ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto rhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -192,8 +192,8 @@ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto rhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -205,8 +205,8 @@ template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto rhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -220,9 +220,9 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(); - auto x_v = x.View(); - auto y_v = y.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto x_v = x.View(AcceleratorRead); + auto y_v = y.View(AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+y_v(ss); coalescedWrite(ret_v[ss],tmp); @@ -233,9 +233,9 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(); - auto x_v = x.View(); - auto y_v = y.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto x_v = x.View(AcceleratorRead); + auto y_v = y.View(AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+b*y_v(ss); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 76622275..17f84d44 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -28,6 +28,7 @@ See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + #pragma once #define STREAMING_STORES @@ -36,181 +37,6 @@ NAMESPACE_BEGIN(Grid); extern int GridCshiftPermuteMap[4][16]; -/////////////////////////////////////////////////////////////////// -// Base class which can be used by traits to pick up behaviour -/////////////////////////////////////////////////////////////////// -class LatticeBase {}; - -///////////////////////////////////////////////////////////////////////////////////////// -// Conformable checks; same instance of Grid required -///////////////////////////////////////////////////////////////////////////////////////// -void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) -{ - assert(lhs == rhs); -} - -//////////////////////////////////////////////////////////////////////////// -// Minimal base class containing only data valid to access from accelerator -// _odata will be a managed pointer in CUDA -//////////////////////////////////////////////////////////////////////////// -// Force access to lattice through a view object. -// prevents writing of code that will not offload to GPU, but perhaps annoyingly -// strict since host could could in principle direct access through the lattice object -// Need to decide programming model. -#define LATTICE_VIEW_STRICT -template class LatticeAccelerator : public LatticeBase -{ -protected: - GridBase *_grid; - int checkerboard; - vobj *_odata; // A managed pointer - uint64_t _odata_size; -public: - accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { }; - accelerator_inline uint64_t oSites(void) const { return _odata_size; }; - accelerator_inline int Checkerboard(void) const { return checkerboard; }; - accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view - accelerator_inline void Conformable(GridBase * &grid) const - { - if (grid) conformable(grid, _grid); - else grid = _grid; - }; -}; - -///////////////////////////////////////////////////////////////////////////////////////// -// A View class which provides accessor to the data. -// This will be safe to call from accelerator_for and is trivially copy constructible -// The copy constructor for this will need to be used by device lambda functions -///////////////////////////////////////////////////////////////////////////////////////// -template -class LatticeExprView : public LatticeAccelerator -{ -public: - // Rvalue -#ifdef GRID_SIMT - accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); } -#else - accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } -#endif - - accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; - accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; - - accelerator_inline uint64_t begin(void) const { return 0;}; - accelerator_inline uint64_t end(void) const { return this->_odata_size; }; - accelerator_inline uint64_t size(void) const { return this->_odata_size; }; - - // Non accelerator functions - LatticeExprView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me){} - ~LatticeExprView(){} - - void AcceleratorViewOpen(void) - { // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base - void *cpu_ptr=this->_odata; - // std::cout << "AccViewOpen "<_odata <_odata=(vobj *)AllocationCache::AccViewOpen(this->_odata,this->_odata_size*sizeof(vobj),1,0); - } - void AcceleratorViewClose(void) - { // Inform the manager - // std::cout << "View Close"<_odata<_odata); - } - void CpuViewOpen(void) - { // Translate the pointer - void *cpu_ptr=this->_odata; - // std::cout << "CpuViewOpen "<_odata <_odata=(vobj *)AllocationCache::CpuViewOpen(cpu_ptr,this->_odata_size*sizeof(vobj),1,0); - } - void CpuViewClose(void) - { // Inform the manager - // std::cout << "CpuViewClose"<_odata<_odata); - } - -}; -// UserView constructor,destructor updates view manager -// Non-copyable object??? Second base with copy/= deleted? -template -class LatticeView : public LatticeExprView -{ -public: - // Rvalue - /* -#ifdef GRID_SIMT - accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); } -#else - accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } -#endif - - accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; - accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; - - accelerator_inline uint64_t begin(void) const { return 0;}; - accelerator_inline uint64_t end(void) const { return this->_odata_size; }; - accelerator_inline uint64_t size(void) const { return this->_odata_size; }; - */ - LatticeView(const LatticeAccelerator &refer_to_me) : LatticeExprView (refer_to_me) - { - this->AcceleratorViewOpen(); - } - ~LatticeView(){ - this->AcceleratorViewClose(); - } -}; - - -///////////////////////////////////////////////////////////////////////////////////////// -// Lattice expression types used by ET to assemble the AST -// -// Need to be able to detect code paths according to the whether a lattice object or not -// so introduce some trait type things -///////////////////////////////////////////////////////////////////////////////////////// - -class LatticeExpressionBase {}; - -template using is_lattice = std::is_base_of; -template using is_lattice_expr = std::is_base_of; - -template struct ViewMapBase { typedef T Type; }; -template struct ViewMapBase { typedef LatticeExprView Type; }; -template using ViewMap = ViewMapBase::value >; - -template -class LatticeUnaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - Op op; - T1 arg1; - LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {}; -}; - -template -class LatticeBinaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - typedef typename ViewMap<_T2>::Type T2; - Op op; - T1 arg1; - T2 arg2; - LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {}; -}; - -template -class LatticeTrinaryExpression : public LatticeExpressionBase -{ -public: - typedef typename ViewMap<_T1>::Type T1; - typedef typename ViewMap<_T2>::Type T2; - typedef typename ViewMap<_T3>::Type T3; - Op op; - T1 arg1; - T2 arg2; - T3 arg3; - LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {}; -}; - ///////////////////////////////////////////////////////////////////////////////////////// // The real lattice class, with normal copy and assignment semantics. // This contains extra (host resident) grid pointer data that may be accessed by host code @@ -253,14 +79,20 @@ private: } } public: + ///////////////////////////////////////////////////////////////////////////////// + // Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents + ///////////////////////////////////////////////////////////////////////////////// + void SetViewMode(ViewMode mode) { + LatticeView accessor(*( (LatticeAccelerator *) this),mode); + } ///////////////////////////////////////////////////////////////////////////////// // Return a view object that may be dereferenced in site loops. // The view is trivially copy constructible and may be copied to an accelerator device // in device lambdas ///////////////////////////////////////////////////////////////////////////////// - LatticeView View (void) const + LatticeView View (ViewMode mode) const { - LatticeView accessor(*( (LatticeAccelerator *) this)); + LatticeView accessor(*( (LatticeAccelerator *) this),mode); return accessor; } @@ -286,7 +118,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); @@ -308,7 +140,7 @@ public: auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); @@ -329,7 +161,7 @@ public: this->checkerboard=cb; auto exprCopy = expr; ExpressionViewOpen(exprCopy); - auto me = View(); + auto me = View(AcceleratorWriteDiscard); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); @@ -385,9 +217,9 @@ public: } template inline Lattice & operator = (const sobj & r){ - auto me = View(); - thread_for(ss,me.size(),{ - me[ss] = r; + auto me = View(AcceleratorWriteDiscard); + accelerator_for(ss,me.size(),1,{ + me[ss]= r; }); return *this; } @@ -398,11 +230,12 @@ public: /////////////////////////////////////////// // user defined constructor /////////////////////////////////////////// - Lattice(GridBase *grid) { + Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { this->_grid = grid; resize(this->_grid->oSites()); assert((((uint64_t)&this->_odata[0])&0xF) ==0); this->checkerboard=0; + SetViewMode(mode); } // virtual ~Lattice(void) = default; @@ -418,7 +251,6 @@ public: // copy constructor /////////////////////////////////////////// Lattice(const Lattice& r){ - // std::cout << "Lattice constructor(const Lattice &) "<_grid = r.Grid(); resize(this->_grid->oSites()); *this = r; @@ -441,8 +273,8 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = View(); - auto him= r.View(); + auto me = View(AcceleratorWriteDiscard); + auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); @@ -455,8 +287,8 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = View(); - auto him= r.View(); + auto me = View(AcceleratorWriteDiscard); + auto him= r.View(AcceleratorRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); diff --git a/Grid/lattice/Lattice_comparison.h b/Grid/lattice/Lattice_comparison.h index bbed2ef5..17a61750 100644 --- a/Grid/lattice/Lattice_comparison.h +++ b/Grid/lattice/Lattice_comparison.h @@ -78,9 +78,9 @@ template inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + auto lhs_v = lhs.View(CpuRead); + auto rhs_v = rhs.View(CpuRead); + auto ret_v = ret.View(CpuWrite); thread_for( ss, rhs_v.size(), { ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); }); @@ -93,8 +93,8 @@ template inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) { Lattice ret(lhs.Grid()); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); + auto lhs_v = lhs.View(CpuRead); + auto ret_v = ret.View(CpuWrite); thread_for( ss, lhs_v.size(), { ret_v[ss]=op(lhs_v[ss],rhs); }); @@ -107,8 +107,8 @@ template inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + auto rhs_v = rhs.View(CpuRead); + auto ret_v = ret.View(CpuWrite); thread_for( ss, rhs_v.size(), { ret_v[ss]=op(lhs,rhs_v[ss]); }); diff --git a/Grid/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h index a1abe58d..b8e73b25 100644 --- a/Grid/lattice/Lattice_coordinate.h +++ b/Grid/lattice/Lattice_coordinate.h @@ -37,7 +37,7 @@ template inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - auto l_v = l.View(); + auto l_v = l.View(CpuWrite); thread_for( o, grid->oSites(), { vector_type vI; Coordinate gcoor; @@ -51,23 +51,5 @@ template inline void LatticeCoordinate(Lattice &l,int mu) }); }; -// LatticeCoordinate(); -// FIXME for debug; deprecate this; made obscelete by -template void lex_sites(Lattice &l){ - auto l_v = l.View(); - Real *v_ptr = (Real *)&l_v[0]; - size_t o_len = l.Grid()->oSites(); - size_t v_len = sizeof(vobj)/sizeof(vRealF); - size_t vec_len = vRealF::Nsimd(); - - for(int i=0;i inline auto localNorm2 (const Lattice &rhs)-> Lattice { Lattice ret(rhs.Grid()); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + auto rhs_v = rhs.View(AcceleratorRead); + auto ret_v = ret.View(AcceleratorWrite); accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); }); @@ -56,9 +56,9 @@ template inline auto localInnerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + auto lhs_v = lhs.View(AcceleratorRead); + auto rhs_v = rhs.View(AcceleratorRead); + auto ret_v = ret.View(AcceleratorWrite); accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); }); @@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Latt typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(rr())) srr; Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); - auto ret_v = ret.View(); + auto lhs_v = lhs.View(AcceleratorRead); + auto rhs_v = rhs.View(AcceleratorRead); + auto ret_v = ret.View(AcceleratorWrite); accelerator_for(ss,rhs_v.size(),1,{ // FIXME had issues with scalar version of outer // Use vector [] operator and don't read coalesce this loop diff --git a/Grid/lattice/Lattice_matrix_reduction.h b/Grid/lattice/Lattice_matrix_reduction.h index 0980ad8a..88de5210 100644 --- a/Grid/lattice/Lattice_matrix_reduction.h +++ b/Grid/lattice/Lattice_matrix_reduction.h @@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v = X.View(); - auto Y_v = Y.View(); - auto R_v = R.View(); + auto X_v = X.View(CpuRead); + auto Y_v = Y.View(CpuRead); + auto R_v = R.View(CpuWrite); thread_region { std::vector s_x(Nblock); @@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v = X.View(); - auto R_v = R.View(); + auto X_v = X.View(CpuRead); + auto R_v = R.View(CpuWrite); thread_region { @@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice int ostride=FullGrid->_ostride[Orthog]; typedef typename vobj::vector_typeD vector_typeD; - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto lhs_v = lhs.View(CpuRead); + auto rhs_v = rhs.View(CpuRead); thread_region { std::vector Left(Nblock); std::vector Right(Nblock); diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index 8f649bd7..af98c07b 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -46,8 +46,8 @@ auto PeekIndex(const Lattice &lhs,int i) -> Lattice(vobj(),i))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(CpuWrite); + auto lhs_v = lhs.View(CpuRead); thread_for( ss, lhs_v.size(), { ret_v[ss] = peekIndex(lhs_v[ss],i); }); @@ -58,8 +58,8 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice(vobj(),i,j))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(CpuWrite); + auto lhs_v = lhs.View(CpuRead); thread_for( ss, lhs_v.size(), { ret_v[ss] = peekIndex(lhs_v[ss],i,j); }); @@ -72,8 +72,8 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice void PokeIndex(Lattice &lhs,const Lattice(vobj(),0))> & rhs,int i) { - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(CpuRead); + auto lhs_v = lhs.View(CpuWrite); thread_for( ss, lhs_v.size(), { pokeIndex(lhs_v[ss],rhs_v[ss],i); }); @@ -81,8 +81,8 @@ void PokeIndex(Lattice &lhs,const Lattice(vobj() template void PokeIndex(Lattice &lhs,const Lattice(vobj(),0,0))> & rhs,int i,int j) { - auto rhs_v = rhs.View(); - auto lhs_v = lhs.View(); + auto rhs_v = rhs.View(CpuRead); + auto lhs_v = lhs.View(CpuWrite); thread_for( ss, lhs_v.size(), { pokeIndex(lhs_v[ss],rhs_v[ss],i,j); }); @@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ // extract-modify-merge cycle is easiest way and this is not perf critical ExtractBuffer buf(Nsimd); - auto l_v = l.View(); + auto l_v = l.View(CpuWrite); if ( rank == grid->ThisRank() ) { extract(l_v[odx],buf); buf[idx] = s; @@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ grid->GlobalCoorToRankIndex(rank,odx,idx,site); ExtractBuffer buf(Nsimd); - auto l_v = l.View(); + auto l_v = l.View(CpuWrite); extract(l_v[odx],buf); s = buf[idx]; @@ -173,7 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ idx= grid->iIndex(site); odx= grid->oIndex(site); - auto l_v = l.View(); + auto l_v = l.View(CpuRead); scalar_type * vp = (scalar_type *)&l_v[odx]; scalar_type * pt = (scalar_type *)&s; @@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ idx= grid->iIndex(site); odx= grid->oIndex(site); - auto l_v = l.View(); + auto l_v = l.View(CpuWrite); scalar_type * vp = (scalar_type *)&l_v[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w inline Lattice adj(const Lattice &lhs){ Lattice ret(lhs.Grid()); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); + auto lhs_v = lhs.View(AcceleratorRead); + auto ret_v = ret.View(AcceleratorWrite); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], adj(lhs_v(ss))); }); @@ -50,8 +50,8 @@ template inline Lattice adj(const Lattice &lhs){ template inline Lattice conjugate(const Lattice &lhs){ Lattice ret(lhs.Grid()); - auto lhs_v = lhs.View(); - auto ret_v = ret.View(); + auto lhs_v = lhs.View(AcceleratorRead); + auto ret_v = ret.View(AcceleratorWrite); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 997affe8..99d799b6 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -76,7 +76,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) template inline typename vobj::scalar_object sum(const Lattice &arg) { - auto arg_v = arg.View(); + auto arg_v = arg.View(AcceleratorRead); Integer osites = arg.Grid()->oSites(); auto ssum= sum(&arg_v[0],osites); arg.Grid()->GlobalSum(ssum); @@ -102,8 +102,8 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ GridBase *grid = left.Grid(); // Might make all code paths go this way. - auto left_v = left.View(); - auto right_v=right.View(); + auto left_v = left.View(AcceleratorRead); + auto right_v=right.View(AcceleratorRead); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); @@ -167,9 +167,9 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt GridBase *grid = x.Grid(); - auto x_v=x.View(); - auto y_v=y.View(); - auto z_v=z.View(); + auto x_v=x.View(AcceleratorRead); + auto y_v=y.View(AcceleratorRead); + auto z_v=z.View(AcceleratorWrite); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); @@ -271,7 +271,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< // sum over reduced dimension planes, breaking out orthog dir // Parallel over orthog direction - auto Data_v=Data.View(); + auto Data_v=Data.View(CpuRead); thread_for( r,rd, { int so=r*grid->_ostride[orthogdim]; // base offset for start of plane for(int n=0;n & result, const Latti int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - auto lhv=lhs.View(); - auto rhv=rhs.View(); + auto lhv=lhs.View(CpuRead); + auto rhv=rhs.View(CpuRead); thread_for( r,rd,{ int so=r*grid->_ostride[orthogdim]; // base offset for start of plane @@ -457,14 +457,12 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice tensor_reduced at; at=av; - auto Rv=R.View(); - auto Xv=X.View(); - auto Yv=Y.View(); - thread_for_collapse(2, n, e1, { - for(int b=0;b &R,Eigen::MatrixXcd &aa,const Lattice int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v=X.View(); - auto Y_v=Y.View(); - auto R_v=R.View(); + auto X_v=X.View(CpuRead); + auto Y_v=Y.View(CpuRead); + auto R_v=R.View(CpuWrite); thread_region { Vector s_x(Nblock); @@ -564,13 +562,14 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< // int nl=1; //FIXME package in a convenient iterator + // thread_for2d_in_region //Should loop over a plane orthogonal to direction "Orthog" int stride=FullGrid->_slice_stride[Orthog]; int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto R_v = R.View(); - auto X_v = X.View(); + auto R_v = R.View(CpuWrite); + auto X_v = X.View(CpuRead); thread_region { std::vector s_x(Nblock); @@ -628,8 +627,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice typedef typename vobj::vector_typeD vector_typeD; - auto lhs_v=lhs.View(); - auto rhs_v=rhs.View(); + auto lhs_v=lhs.View(CpuRead); + auto rhs_v=rhs.View(CpuRead); thread_region { std::vector Left(Nblock); diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index 04b74873..e5da8d35 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -375,7 +375,7 @@ public: int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int words = sizeof(scalar_object) / sizeof(scalar_type); - auto l_v = l.View(); + auto l_v = l.View(CpuWrite); thread_for( ss, osites, { ExtractBuffer buf(Nsimd); for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times diff --git a/Grid/lattice/Lattice_trace.h b/Grid/lattice/Lattice_trace.h index 93444e0c..8d1f85bd 100644 --- a/Grid/lattice/Lattice_trace.h +++ b/Grid/lattice/Lattice_trace.h @@ -41,8 +41,8 @@ template inline auto trace(const Lattice &lhs) -> Lattice { Lattice ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], trace(lhs_v(ss))); }); @@ -56,8 +56,8 @@ template inline auto TraceIndex(const Lattice &lhs) -> Lattice(vobj()))> { Lattice(vobj()))> ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], traceIndex(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index c80e7db2..9e98d111 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -49,8 +49,8 @@ inline void subdivides(GridBase *coarse,GridBase *fine) template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ half.Checkerboard() = cb; - auto half_v = half.View(); - auto full_v = full.View(); + auto half_v = half.View(CpuWrite); + auto full_v = full.View(CpuRead); thread_for(ss, full.Grid()->oSites(),{ int cbos; Coordinate coor; @@ -65,8 +65,8 @@ template inline void pickCheckerboard(int cb,Lattice &half,con } template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.Checkerboard(); - auto half_v = half.View(); - auto full_v = full.View(); + auto half_v = half.View(CpuRead); + auto full_v = full.View(CpuWrite); thread_for(ss,full.Grid()->oSites(),{ Coordinate coor; @@ -92,9 +92,8 @@ inline void blockProject(Lattice > &coarseData, Lattice ip(coarse); - // auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); - auto ip_ = ip.View(); + auto coarseData_ = coarseData.View(AcceleratorWrite); + auto ip_ = ip.View(AcceleratorWrite); for(int v=0;voSites(), vobj::Nsimd(), { @@ -102,7 +101,7 @@ inline void blockProject(Lattice > &coarseData, }); } } - +#if 0 template inline void blockProject1(Lattice > &coarseData, const Lattice &fineData, @@ -132,8 +131,8 @@ inline void blockProject1(Lattice > &coarseData, coarseData=Zero(); - auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); + auto fineData_ = fineData.View(AcceleratorRead); + auto coarseData_ = coarseData.View(AcceleratorWrite); //////////////////////////////////////////////////////////////////////////////////////////////////////// // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. // Otherwise do fine inner product per site, and make the update atomic @@ -142,7 +141,7 @@ inline void blockProject1(Lattice > &coarseData, auto sc=sci/nbasis; auto i=sci%nbasis; - auto Basis_ = Basis[i].View(); + auto Basis_ = Basis[i].View(AcceleratorRead); Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate @@ -165,6 +164,7 @@ inline void blockProject1(Lattice > &coarseData, }); return; } +#endif template inline void blockZAXPY(Lattice &fineZ, @@ -191,10 +191,10 @@ inline void blockZAXPY(Lattice &fineZ, assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); } - auto fineZ_ = fineZ.View(); - auto fineX_ = fineX.View(); - auto fineY_ = fineY.View(); - auto coarseA_= coarseA.View(); + auto fineZ_ = fineZ.View(AcceleratorWrite); + auto fineX_ = fineX.View(AcceleratorRead); + auto fineY_ = fineY.View(AcceleratorRead); + auto coarseA_= coarseA.View(AcceleratorRead); accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { @@ -227,11 +227,10 @@ inline void blockInnerProduct(Lattice &CoarseInner, Lattice coarse_inner(coarse); // Precision promotion? - auto CoarseInner_ = CoarseInner.View(); - auto coarse_inner_ = coarse_inner.View(); - fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); + auto CoarseInner_ = CoarseInner.View(AcceleratorWrite); + auto coarse_inner_ = coarse_inner.View(AcceleratorRead); accelerator_for(ss, coarse->oSites(), 1, { CoarseInner_[ss] = coarse_inner_[ss]; }); @@ -266,8 +265,8 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) // Turn this around to loop threaded over sc and interior loop // over sf would thread better - auto coarseData_ = coarseData.View(); - auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.View(AcceleratorWrite); + auto fineData_ = fineData.View(AcceleratorRead); accelerator_for(sc,coarse->oSites(),1,{ @@ -360,8 +359,8 @@ inline void blockPromote(const Lattice > &coarseData, for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } - auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); + auto fineData_ = fineData.View(AcceleratorWrite); + auto coarseData_ = coarseData.View(AcceleratorRead); // Loop with a cache friendly loop ordering accelerator_for(sf,fine->oSites(),1,{ @@ -374,7 +373,7 @@ inline void blockPromote(const Lattice > &coarseData, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); for(int i=0;i > &coarseData, for(int i=0;i > ip = PeekIndex<0>(coarseData,i); Lattice cip(coarse); - auto cip_ = cip.View(); - auto ip_ = ip.View(); + auto cip_ = cip.View(AcceleratorWrite); + auto ip_ = ip.View(AcceleratorRead); accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ coalescedWrite(cip_[sc], ip_(sc)()); }); @@ -470,8 +469,8 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate rdt = Tg->_rdimensions; Coordinate ist = Tg->_istride; Coordinate ost = Tg->_ostride; - auto t_v = To.View(); - auto f_v = From.View(); + auto t_v = To.View(AcceleratorWrite); + auto f_v = From.View(AcceleratorRead); accelerator_for(idx,Fg->lSites(),1,{ sobj s; Coordinate Fcoor(nd); @@ -718,7 +717,7 @@ unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) } //loop over outer index - auto in_v = in.View(); + auto in_v = in.View(CpuRead); thread_for(in_oidx,in_grid->oSites(),{ //Assemble vector of pointers to output elements ExtractPointerArray out_ptrs(in_nsimd); @@ -811,7 +810,7 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) icoor[lane].resize(ndim); grid->iCoorFromIindex(icoor[lane],lane); } - auto out_v = out.View(); + auto out_v = out.View(CpuWrite); thread_for(oidx, grid->oSites(),{ //Assemble vector of pointers to output elements ExtractPointerArray ptrs(nsimd); @@ -914,7 +913,7 @@ void precisionChange(Lattice &out, const Lattice &in) std::vector in_slex_conv(in_grid->lSites()); unvectorizeToLexOrdArray(in_slex_conv, in); - auto out_v = out.View(); + auto out_v = out.View(CpuWrite); thread_for(out_oidx,out_grid->oSites(),{ Coordinate out_ocoor(ndim); out_grid->oCoorFromOindex(out_ocoor, out_oidx); diff --git a/Grid/lattice/Lattice_transpose.h b/Grid/lattice/Lattice_transpose.h index 9b0b3483..c17a808b 100644 --- a/Grid/lattice/Lattice_transpose.h +++ b/Grid/lattice/Lattice_transpose.h @@ -41,8 +41,8 @@ NAMESPACE_BEGIN(Grid); template inline Lattice transpose(const Lattice &lhs){ Lattice ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); }); @@ -56,8 +56,8 @@ template inline auto TransposeIndex(const Lattice &lhs) -> Lattice(vobj()))> { Lattice(vobj()))> ret(lhs.Grid()); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.View(AcceleratorWrite); + auto lhs_v = lhs.View(AcceleratorRead); accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss] , transposeIndex(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_unary.h b/Grid/lattice/Lattice_unary.h index 591afe72..10aa7472 100644 --- a/Grid/lattice/Lattice_unary.h +++ b/Grid/lattice/Lattice_unary.h @@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid); template Lattice pow(const Lattice &rhs_i,RealD y){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + auto rhs = rhs_i.View(AcceleratorRead); + auto ret = ret_i.View(AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),1,{ ret[ss]=pow(rhs[ss],y); @@ -45,8 +45,8 @@ template Lattice pow(const Lattice &rhs_i,RealD y){ } template Lattice mod(const Lattice &rhs_i,Integer y){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + auto rhs = rhs_i.View(AcceleratorRead); + auto ret = ret_i.View(AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],mod(rhs(ss),y)); @@ -56,8 +56,8 @@ template Lattice mod(const Lattice &rhs_i,Integer y){ template Lattice div(const Lattice &rhs_i,Integer y){ Lattice ret_i(rhs_i.Grid()); - auto ret = ret_i.View(); - auto rhs = rhs_i.View(); + auto ret = ret_i.View(AcceleratorWrite); + auto rhs = rhs_i.View(AcceleratorRead); ret.Checkerboard() = rhs_i.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],div(rhs(ss),y)); @@ -67,8 +67,8 @@ template Lattice div(const Lattice &rhs_i,Integer y){ template Lattice expMat(const Lattice &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(); - auto ret = ret_i.View(); + auto rhs = rhs_i.View(AcceleratorRead); + auto ret = ret_i.View(AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index 47d1a861..a8ae90ec 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -233,10 +233,10 @@ public: Uconj = where(coor==neglink,-Uconj,Uconj); } - auto U_v = U.View(); - auto Uds_v = Uds.View(); - auto Uconj_v = Uconj.View(); - auto Utmp_v= Utmp.View(); + auto U_v = U.View(CpuRead); + auto Uds_v = Uds.View(CpuWrite); + auto Uconj_v = Uconj.View(CpuRead); + auto Utmp_v= Utmp.View(CpuWrite); thread_foreach(ss,U_v,{ Uds_v[ss](0)(mu) = U_v[ss](); Uds_v[ss](1)(mu) = Uconj_v[ss](); @@ -272,8 +272,8 @@ public: GaugeLinkField link(mat.Grid()); // use lorentz for flavour as hack. auto tmp = TraceIndex(outerProduct(Btilde, A)); - auto link_v = link.View(); - auto tmp_v = tmp.View(); + auto link_v = link.View(CpuWrite); + auto tmp_v = tmp.View(CpuRead); thread_foreach(ss,tmp_v,{ link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); }); @@ -306,9 +306,9 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - auto tmp_v = tmp.View(); - auto Atilde_v = Atilde.View(); - auto Btilde_v = Btilde.View(); + auto tmp_v = tmp.View(CpuWrite); + auto Atilde_v = Atilde.View(CpuRead); + auto Btilde_v = Btilde.View(CpuRead); thread_for(ss,tmp.Grid()->oSites(),{ for (int s = 0; s < Ls; s++) { int sF = s + Ls * ss; diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 3847b0d9..05143551 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -264,8 +264,8 @@ private: { CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); + auto T_v = T.View(CpuWrite); + auto F_v = F.View(CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); @@ -282,8 +282,8 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); + auto T_v = T.View(CpuWrite); + auto F_v = F.View(CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 1) = -F_v[i]()(); @@ -300,8 +300,8 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); + auto T_v = T.View(CpuWrite); + auto F_v = F.View(CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); @@ -318,8 +318,8 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); + auto T_v = T.View(CpuWrite); + auto F_v = F.View(CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 1) = timesI(F_v[i]()()); @@ -336,8 +336,8 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); + auto T_v = T.View(CpuWrite); + auto F_v = F.View(CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 1) = -(F_v[i]()()); @@ -355,8 +355,8 @@ private: T = Zero(); - auto T_v = T.View(); - auto F_v = F.View(); + auto T_v = T.View(CpuWrite); + auto F_v = F.View(CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 0) = timesI(F_v[i]()()); diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index e78023cf..356d0941 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -106,9 +106,9 @@ public: const _SpinorField & phi, int mu) { - auto out_v= out.View(); - auto phi_v= phi.View(); - auto Umu_v= Umu.View(); + auto out_v= out.View(CpuWrite); + auto phi_v= phi.View(CpuRead); + auto Umu_v= Umu.View(CpuRead); thread_for(sss,out.Grid()->oSites(),{ multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); }); @@ -191,9 +191,9 @@ public: int Ls=Btilde.Grid()->_fdimensions[0]; GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - auto tmp_v = tmp.View(); - auto Btilde_v = Btilde.View(); - auto Atilde_v = Atilde.View(); + auto tmp_v = tmp.View(CpuWrite); + auto Btilde_v = Btilde.View(CpuRead); + auto Atilde_v = Atilde.View(CpuRead); thread_for(sss,tmp.Grid()->oSites(),{ int sU=sss; for(int s=0;s::M5D(const FermionField &psi_i, chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto phi = phi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; @@ -93,9 +93,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto phi = phi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; @@ -131,8 +131,8 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); int Ls=this->Ls; @@ -193,8 +193,8 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi GridBase *grid=psi_i.Grid(); int Ls=this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); auto plee = & lee [0]; auto pdee = & dee [0]; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h index 034ce642..079ea481 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h @@ -65,9 +65,9 @@ CayleyFermion5D::M5D(const FermionField &psi_i, EnableIf sfinae=0; chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(CpuRead); + auto phi = phi_i.View(CpuRead); + auto chi = chi_i.View(CpuWrite); int Ls = this->Ls; int LLs = grid->_rdimensions[0]; const int nsimd= Simd::Nsimd(); @@ -213,9 +213,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, EnableIf sfinae=0; chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi=psi_i.View(); - auto phi=phi_i.View(); - auto chi=chi_i.View(); + auto psi=psi_i.View(CpuRead); + auto phi=phi_i.View(CpuRead); + auto chi=chi_i.View(CpuWrite); int Ls = this->Ls; int LLs = grid->_rdimensions[0]; int nsimd= Simd::Nsimd(); @@ -357,8 +357,8 @@ CayleyFermion5D::MooeeInternalAsm(const FermionField &psi_i, FermionField Vector > &Matm) { EnableIf sfinae=0; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(CpuRead); + auto chi = chi_i.View(CpuWrite); #ifndef AVX512 { SiteHalfSpinor BcastP; @@ -535,8 +535,8 @@ CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi_i, FermionField EnableIf sfinae=0; #ifndef AVX512 { - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(CpuRead); + auto chi = chi_i.View(CpuWrite); SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; @@ -586,8 +586,8 @@ CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi_i, FermionField } #else { - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(CpuRead); + auto chi = chi_i.View(CpuWrite); // pointers // MASK_REGS; #define Chi_00 %zmm0 diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 46d3fa1f..100eb0d2 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -46,9 +46,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi chi_i.Checkerboard() = psi_i.Checkerboard(); int Ls = this->Ls; GridBase* grid = psi_i.Grid(); - auto phi = phi_i.View(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto phi = phi_i.View(AcceleratorRead); + auto psi = psi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; auto pupper = &upper[0]; @@ -82,9 +82,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio GridBase* grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto phi = phi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; auto pupper = &upper[0]; @@ -116,8 +116,8 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); - auto psi=psi_i.View(); - auto chi=chi_i.View(); + auto psi=psi_i.View(AcceleratorRead); + auto chi=chi_i.View(AcceleratorWrite); int Ls = this->Ls; auto plee = & this->lee[0]; @@ -172,8 +172,8 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); int Ls = this->Ls; auto plee = & this->lee[0]; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index 23692d49..01d5578f 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionFi Compressor compressor; Stencil.HaloExchange(in,compressor); - auto Umu_v = Umu.View(); - auto UUUmu_v = UUUmu.View(); - auto in_v = in.View(); - auto out_v = out.View(); + auto Umu_v = Umu.View(CpuRead); + auto UUUmu_v = UUUmu.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); thread_for( ss,Umu.Grid()->oSites(),{ for(int s=0;s::DhopInternalOverlappedComms(StencilImpl & } // do the compute - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); + auto U_v = U.View(CpuRead); + auto UUU_v = UUU.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); if (dag == DaggerYes) { for (int ss = myblock; ss < myblock+myn; ++ss) { @@ -376,10 +376,10 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & DhopComputeTime2-=usecond(); - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); + auto U_v = U.View(CpuRead); + auto UUU_v = UUU.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); if (dag == DaggerYes) { int sz=st.surface_list.size(); thread_for( ss,sz,{ @@ -418,10 +418,10 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, DhopComputeTime -= usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); + auto U_v = U.View(CpuRead); + auto UUU_v = UUU.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); if (dag == DaggerYes) { thread_for( ss,U.Grid()->oSites(),{ int sU=ss; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 37675da0..1e59c4e7 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -250,10 +250,10 @@ void ImprovedStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGauge //////////////////////// // Call the single hop //////////////////////// - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto B_v = B.View(); - auto Btilde_v = Btilde.View(); + auto U_v = U.View(CpuRead); + auto UUU_v = UUU.View(CpuRead); + auto B_v = B.View(CpuWrite); + auto Btilde_v = Btilde.View(CpuWrite); thread_for(sss,B.Grid()->oSites(),{ Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); }); @@ -378,10 +378,10 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel Compressor compressor; Stencil.HaloExchange(in, compressor); - auto Umu_v = Umu.View(); - auto UUUmu_v = UUUmu.View(); - auto in_v = in.View(); - auto out_v = out.View(); + auto Umu_v = Umu.View(CpuRead); + auto UUUmu_v = UUUmu.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); thread_for( sss, in.Grid()->oSites(),{ Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); }); @@ -449,10 +449,10 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st } // do the compute - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); + auto U_v = U.View(CpuRead); + auto UUU_v = UUU.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); if (dag == DaggerYes) { for (int ss = myblock; ss < myblock+myn; ++ss) { int sU = ss; @@ -479,10 +479,10 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st DhopComputeTime2 -= usecond(); { - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); + auto U_v = U.View(CpuRead); + auto UUU_v = UUU.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); if (dag == DaggerYes) { int sz=st.surface_list.size(); thread_for(ss,sz,{ @@ -520,10 +520,10 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le st.HaloExchange(in, compressor); DhopCommTime += usecond(); - auto U_v = U.View(); - auto UUU_v = UUU.View(); - auto in_v = in.View(); - auto out_v = out.View(); + auto U_v = U.View(CpuRead); + auto UUU_v = UUU.View(CpuRead); + auto in_v = in.View(CpuRead); + auto out_v = out.View(CpuWrite); DhopComputeTime -= usecond(); if (dag == DaggerYes) { thread_for(sss, in.Grid()->oSites(),{ diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index f74c7a51..ed7be056 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -44,9 +44,9 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto phi = phi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -84,9 +84,9 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto phi = phi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); auto pm = this->pm; int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator @@ -132,9 +132,9 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto phi = phi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -174,9 +174,9 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm GridBase *grid = psi_i.Grid(); int Ls = this->Ls; int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator - auto psi = psi_i.View(); - auto phi = phi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto phi = phi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -226,8 +226,8 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); auto plee = & this->lee [0]; auto pdee = & this->dee [0]; @@ -286,8 +286,8 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); auto pm = this->pm; auto plee = & this->lee [0]; @@ -354,8 +354,8 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); auto plee = & this->lee [0]; auto pdee = & this->dee [0]; @@ -410,8 +410,8 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); - auto psi = psi_i.View(); - auto chi = chi_i.View(); + auto psi = psi_i.View(AcceleratorRead); + auto chi = chi_i.View(AcceleratorWrite); int Ls = this->Ls; auto pm = this->pm; diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index be05fcf8..9e492831 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -475,12 +475,12 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, // Inefficient comms method but not performance critical. tmp1 = Cshift(q_in_1, mu, 1); tmp2 = Cshift(q_in_2, mu, 1); - auto tmp1_v = tmp1.View(); - auto tmp2_v = tmp2.View(); - auto q_in_1_v=q_in_1.View(); - auto q_in_2_v=q_in_2.View(); - auto q_out_v = q_out.View(); - auto Umu_v = Umu.View(); + auto tmp1_v = tmp1.View(CpuWrite); + auto tmp2_v = tmp2.View(CpuWrite); + auto q_in_1_v=q_in_1.View(CpuRead); + auto q_in_2_v=q_in_2.View(CpuRead); + auto q_out_v = q_out.View(CpuRead); + auto Umu_v = Umu.View(CpuRead); thread_for(sU, Umu.Grid()->oSites(),{ Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU], q_in_2_v[sU], @@ -526,11 +526,11 @@ void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, tmp = lattice_cmplx*q_in; tmpBwd = Cshift(tmp, mu, -1); - auto coords_v = coords.View(); - auto tmpFwd_v = tmpFwd.View(); - auto tmpBwd_v = tmpBwd.View(); - auto Umu_v = Umu.View(); - auto q_out_v = q_out.View(); + auto coords_v = coords.View(CpuRead); + auto tmpFwd_v = tmpFwd.View(CpuRead); + auto tmpBwd_v = tmpBwd.View(CpuRead); + auto Umu_v = Umu.View(CpuRead); + auto q_out_v = q_out.View(CpuWrite); thread_for(sU, Umu.Grid()->oSites(), { diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 14a2ec9e..587bf42c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -348,18 +348,18 @@ template void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, std::vector &out) { - auto U_v = U.View(); - auto in_v = in.View(); - auto st_v = st.View(); + auto U_v = U.View(AcceleratorRead); + auto in_v = in.View(AcceleratorRead); + auto st_v = st.View(AcceleratorRead); - auto out_Xm = out[0].View(); - auto out_Ym = out[1].View(); - auto out_Zm = out[2].View(); - auto out_Tm = out[3].View(); - auto out_Xp = out[4].View(); - auto out_Yp = out[5].View(); - auto out_Zp = out[6].View(); - auto out_Tp = out[7].View(); + auto out_Xm = out[0].View(AcceleratorWrite); + auto out_Ym = out[1].View(AcceleratorWrite); + auto out_Zm = out[2].View(AcceleratorWrite); + auto out_Tm = out[3].View(AcceleratorWrite); + auto out_Xp = out[4].View(AcceleratorWrite); + auto out_Yp = out[5].View(AcceleratorWrite); + auto out_Zp = out[6].View(AcceleratorWrite); + auto out_Tp = out[7].View(AcceleratorWrite); auto CBp=st.CommBuf(); accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ int sU=sss/Ls; @@ -383,10 +383,10 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S assert(dirdisp<=7); assert(dirdisp>=0); - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + auto U_v = U.View(AcceleratorRead); + auto in_v = in.View(AcceleratorRead); + auto out_v = out.View(AcceleratorWrite); + auto st_v = st.View(AcceleratorRead); auto CBp=st.CommBuf(); #define LoopBody(Dir) \ case Dir : \ @@ -438,10 +438,10 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField int Ls, int Nsite, const FermionField &in, FermionField &out, int interior,int exterior) { - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + auto U_v = U.View(AcceleratorRead); + auto in_v = in.View(AcceleratorRead); + auto out_v = out.View(AcceleratorWrite); + auto st_v = st.View(AcceleratorRead); if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} @@ -469,10 +469,10 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField int Ls, int Nsite, const FermionField &in, FermionField &out, int interior,int exterior) { - auto U_v = U.View(); - auto in_v = in.View(); - auto out_v = out.View(); - auto st_v = st.View(); + auto U_v = U.View(AcceleratorRead); + auto in_v = in.View(AcceleratorRead); + auto out_v = out.View(AcceleratorWrite); + auto st_v = st.View(AcceleratorRead); if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index b9a5296d..79549dcb 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -86,8 +86,8 @@ public: // Move this elsewhere? FIXME static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W - auto U_v = U.View(); - auto W_v = W.View(); + auto U_v = U.View(CpuWrite); + auto W_v = W.View(CpuRead); thread_for( ss, U.Grid()->oSites(), { U_v[ss](mu) = U_v[ss](mu) + W_v[ss](); }); @@ -131,8 +131,8 @@ public: //static std::chrono::duration diff; //auto start = std::chrono::high_resolution_clock::now(); - auto U_v = U.View(); - auto P_v = P.View(); + auto U_v = U.View(CpuWrite); + auto P_v = P.View(CpuRead); thread_for(ss, P.Grid()->oSites(),{ for (int mu = 0; mu < Nd; mu++) { U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h index 3be84480..7ac85d56 100644 --- a/Grid/qcd/action/scalar/ScalarInteractionAction.h +++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h @@ -89,8 +89,8 @@ public: action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared; - auto p_v = p.View(); - auto action_v = action.View(); + auto p_v = p.View(CpuRead); + auto action_v = action.View(CpuWrite); for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils @@ -146,8 +146,8 @@ public: for (int point = 0; point < npoint; point++) { - auto p_v = p.View(); - auto force_v = force.View(); + auto p_v = p.View(CpuRead); + auto force_v = force.View(CpuWrite); int permute_type; StencilEntry *SE; diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h index f4d00c72..0ff7fc25 100644 --- a/Grid/qcd/smearing/GaugeConfiguration.h +++ b/Grid/qcd/smearing/GaugeConfiguration.h @@ -49,7 +49,7 @@ public: private: const unsigned int smearingLevels; - Smear_Stout StoutSmearing; + Smear_Stout *StoutSmearing; std::vector SmearedSet; // Member functions @@ -72,7 +72,7 @@ private: previous_u = *ThinLinks; for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) { - StoutSmearing.smear(SmearedSet[smearLvl], previous_u); + StoutSmearing->smear(SmearedSet[smearLvl], previous_u); previous_u = SmearedSet[smearLvl]; // For debug purposes @@ -93,7 +93,7 @@ private: GaugeLinkField SigmaKPrime_mu(grid); GaugeLinkField GaugeKmu(grid), Cmu(grid); - StoutSmearing.BaseSmear(C, GaugeK); + StoutSmearing->BaseSmear(C, GaugeK); SigmaK = Zero(); iLambda = Zero(); @@ -107,7 +107,7 @@ private: pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); pokeLorentz(iLambda, iLambda_mu, mu); } - StoutSmearing.derivative(SigmaK, iLambda, + StoutSmearing->derivative(SigmaK, iLambda, GaugeK); // derivative of SmearBase return SigmaK; } @@ -144,14 +144,14 @@ private: // Exponential iQ2 = iQ * iQ; iQ3 = iQ * iQ2; - StoutSmearing.set_uw(u, w, iQ2, iQ3); - StoutSmearing.set_fj(f0, f1, f2, u, w); + StoutSmearing->set_uw(u, w, iQ2, iQ3); + StoutSmearing->set_fj(f0, f1, f2, u, w); e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2; // Getting B1, B2, Gamma and Lambda // simplify this part, reduntant calculations in set_fj - xi0 = StoutSmearing.func_xi0(w); - xi1 = StoutSmearing.func_xi1(w); + xi0 = StoutSmearing->func_xi0(w); + xi1 = StoutSmearing->func_xi1(w); u2 = u * u; w2 = w * w; cosw = cos(w); @@ -219,7 +219,7 @@ public: /* Standard constructor */ SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear, Smear_Stout& Stout) - : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) + : smearingLevels(Nsmear), StoutSmearing(&Stout), ThinLinks(NULL) { for (unsigned int i = 0; i < smearingLevels; ++i) SmearedSet.push_back(*(new GaugeField(UGrid))); @@ -227,7 +227,7 @@ public: /*! For just thin links */ SmearedConfiguration() - : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} + : smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {} // attach the smeared routines to the thin links U and fill the smeared set void set_Field(GaugeField &U) diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index c7c7d329..7ad496b7 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -185,13 +185,13 @@ void A2Autils::MesonField(TensorType &mat, for(int i=0;i::MesonField(TensorType &mat, int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; for ( int m=0;m::PionFieldXX(Eigen::Tensor &mat, for(int i=0;i::PionFieldXX(Eigen::Tensor &mat, } for(int j=0;j::PionFieldWVmom(Eigen::Tensor &mat, for(int i=0;i::PionFieldWVmom(Eigen::Tensor &mat, int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; for ( int m=0;m::AslashField(TensorType &mat, for(int i=0;i::AslashField(TensorType &mat, for ( int m=0;m::ContractWWVV(std::vector &WWVV, for(int d_o=0;d_o::ContractWWVV(std::vector &WWVV, thread_for(ss,grid->oSites(),{ for(int d_o=0;d_o::OuterProductWWVV(PropagatorField &WWVV, const vobj &rhs, const int Ns, const int ss) { - auto WWVV_v = WWVV.View(); + auto WWVV_v = WWVV.View(CpuWrite); for (int s1 = 0; s1 < Ns; s1++){ for (int s2 = 0; s2 < Ns; s2++){ WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0); @@ -1122,10 +1122,10 @@ void A2Autils::ContractFourQuarkColourDiagonal(const PropagatorField &WWV GridBase *grid = WWVV0.Grid(); - auto WWVV0_v = WWVV0.View(); - auto WWVV1_v = WWVV1.View(); - auto O_trtr_v= O_trtr.View(); - auto O_fig8_v= O_fig8.View(); + auto WWVV0_v = WWVV0.View(CpuRead); + auto WWVV1_v = WWVV1.View(CpuRead); + auto O_trtr_v= O_trtr.View(CpuWrite); + auto O_fig8_v= O_fig8.View(CpuWrite); thread_for(ss,grid->oSites(),{ typedef typename ComplexField::vector_object vobj; @@ -1166,10 +1166,10 @@ void A2Autils::ContractFourQuarkColourMix(const PropagatorField &WWVV0, GridBase *grid = WWVV0.Grid(); - auto WWVV0_v = WWVV0.View(); - auto WWVV1_v = WWVV1.View(); - auto O_trtr_v= O_trtr.View(); - auto O_fig8_v= O_fig8.View(); + auto WWVV0_v = WWVV0.View(CpuRead); + auto WWVV1_v = WWVV1.View(CpuRead); + auto O_trtr_v= O_trtr.View(CpuWrite); + auto O_fig8_v= O_fig8.View(CpuWrite); thread_for(ss,grid->oSites(),{ diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index d65b9176..d45fd93d 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -273,10 +273,10 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, for (int ie=0; ie < 6 ; ie++) wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; - auto vbaryon_corr= baryon_corr.View(); - auto v1 = q1_left.View(); - auto v2 = q2_left.View(); - auto v3 = q3_left.View(); + auto vbaryon_corr= baryon_corr.View(CpuWrite); + auto v1 = q1_left.View(CpuRead); + auto v2 = q2_left.View(CpuRead); + auto v3 = q3_left.View(CpuRead); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ @@ -560,10 +560,10 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, { GridBase *grid = qs_ti.Grid(); - auto vcorr= stn_corr.View(); - auto vq_loop = qq_loop.View(); - auto vd_tf = qd_tf.View(); - auto vs_ti = qs_ti.View(); + auto vcorr= stn_corr.View(CpuWrite); + auto vq_loop = qq_loop.View(CpuRead); + auto vd_tf = qd_tf.View(CpuRead); + auto vs_ti = qs_ti.View(CpuRead); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ @@ -597,11 +597,11 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, { GridBase *grid = qs_ti.Grid(); - auto vcorr= stn_corr.View(); - auto vq_ti = qq_ti.View(); - auto vq_tf = qq_tf.View(); - auto vd_tf = qd_tf.View(); - auto vs_ti = qs_ti.View(); + auto vcorr= stn_corr.View(CpuWrite); + auto vq_ti = qq_ti.View(CpuRead); + auto vq_tf = qq_tf.View(CpuRead); + auto vd_tf = qd_tf.View(CpuRead); + auto vs_ti = qs_ti.View(CpuRead); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h index 56f8f164..0adbfabf 100644 --- a/Grid/qcd/utils/LinalgUtils.h +++ b/Grid/qcd/utils/LinalgUtils.h @@ -47,8 +47,8 @@ void axpibg5x(Lattice &z,const Lattice &x,Coeff a,Coeff b) GridBase *grid=x.Grid(); Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto z_v = z.View(); + auto x_v = x.View(AcceleratorRead); + auto z_v = z.View(AcceleratorWrite); accelerator_for( ss, x_v.size(),vobj::Nsimd(), { auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss))); coalescedWrite(z_v[ss],tmp); @@ -63,9 +63,9 @@ void axpby_ssp(Lattice &z, Coeff a,const Lattice &x,Coeff b,const La conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + auto x_v = x.View(AcceleratorRead); + auto y_v = y.View(AcceleratorRead); + auto z_v = z.View(AcceleratorWrite); // FIXME -- need a new class of accelerator_loop to implement this // uint64_t nloop = grid->oSites()/Ls; @@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + auto x_v = x.View(AcceleratorRead); + auto y_v = y.View(AcceleratorRead); + auto z_v = z.View(AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + auto x_v = x.View(AcceleratorRead); + auto y_v = y.View(AcceleratorRead); + auto z_v = z.View(AcceleratorWrite); Gamma G5(Gamma::Algebra::Gamma5); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ @@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + auto x_v = x.View(AcceleratorRead); + auto y_v = y.View(AcceleratorRead); + auto z_v = z.View(AcceleratorWrite); Gamma G5(Gamma::Algebra::Gamma5); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ @@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + auto x_v = x.View(AcceleratorRead); + auto y_v = y.View(AcceleratorRead); + auto z_v = z.View(AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice &z,Coeff a,const Lattice &x,Coeff b,con conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(); - auto y_v = y.View(); - auto z_v = z.View(); + auto x_v = x.View(AcceleratorRead); + auto y_v = y.View(AcceleratorRead); + auto z_v = z.View(AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -189,8 +189,8 @@ void G5R5(Lattice &z,const Lattice &x) conformable(x,z); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(); - auto z_v = z.View(); + auto x_v = x.View(AcceleratorRead); + auto z_v = z.View(AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -222,8 +222,8 @@ void G5C(Lattice> &z, const LatticeoSites(),CComplex::Nsimd(), { for(int n = 0; n < nb; ++n) { diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 7ad80d00..5f98f926 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -222,9 +222,9 @@ public: conformable(subgroup, Determinant); int i0, i1; su2SubGroupIndex(i0, i1, su2_index); - auto subgroup_v = subgroup.View(); - auto source_v = source.View(); - auto Determinant_v = Determinant.View(); + auto subgroup_v = subgroup.View(CpuWrite); + auto source_v = source.View(CpuRead); + auto Determinant_v = Determinant.View(CpuWrite); thread_for(ss, grid->oSites(), { @@ -257,8 +257,8 @@ public: su2SubGroupIndex(i0, i1, su2_index); dest = 1.0; // start out with identity - auto dest_v = dest.View(); - auto subgroup_v = subgroup.View(); + auto dest_v = dest.View(CpuWrite); + auto subgroup_v = subgroup.View(CpuRead); thread_for(ss, grid->oSites(), { dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0); diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 7a200ba6..d70bac93 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -67,7 +67,7 @@ void Gather_plane_simple_table (Vector >& table,const Lattice { int num=table.size(); std::pair *table_v = & table[0]; - auto rhs_v = rhs.View(); + auto rhs_v = rhs.View(AcceleratorRead); accelerator_forNB( i,num, vobj::Nsimd(), { typedef decltype(coalescedRead(buffer[0])) compressed_t; compressed_t tmp_c; @@ -94,7 +94,7 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic int num=table.size()/2; int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane - auto rhs_v = rhs.View(); + auto rhs_v = rhs.View(AcceleratorRead); auto p0=&pointers[0][0]; auto p1=&pointers[1][0]; auto tp=&table[0]; @@ -122,7 +122,7 @@ struct StencilEntry { // Could pack to 8 + 4 + 4 = 128 bit and use template -class CartesianStencilView { +class CartesianStencilAccelerator { public: typedef AcceleratorVector StencilVector; @@ -130,14 +130,15 @@ class CartesianStencilView { //////////////////////////////////////// // Basic Grid and stencil info //////////////////////////////////////// - int _checkerboard; - int _npoints; // Move to template param? + int _checkerboard; + int _npoints; // Move to template param? + int _osites; StencilVector _directions; StencilVector _distances; StencilVector _comm_buf_size; StencilVector _permute_type; StencilVector same_node; - Coordinate _simd_layout; + Coordinate _simd_layout; Parameters parameters; StencilEntry* _entries_p; cobj* u_recv_buf_p; @@ -175,13 +176,37 @@ class CartesianStencilView { { Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); } +}; + +template +class CartesianStencilView : public CartesianStencilAccelerator +{ + std::shared_ptr Deleter; + public: + // + CartesianStencilView (const CartesianStencilView &refer_to_me) + : CartesianStencilAccelerator(refer_to_me), Deleter(refer_to_me.Deleter) + { } + CartesianStencilView (const CartesianStencilAccelerator &refer_to_me,ViewMode mode) + : CartesianStencilAccelerator(refer_to_me), Deleter(new MemViewDeleter) + { + Deleter->cpu_ptr =(void *)this->_entries_p; + Deleter->mode = mode; + this->_entries_p =(StencilEntry *) + + AllocationCache::ViewOpen(this->_entries_p, + this->_npoints*this->_osites*sizeof(StencilEntry), + mode, + AdviseDefault); + } }; + //////////////////////////////////////// // The Stencil Class itself //////////////////////////////////////// template -class CartesianStencil : public CartesianStencilView { // Stencil runs along coordinate axes only; NO diagonal fill in. +class CartesianStencil : public CartesianStencilAccelerator { // Stencil runs along coordinate axes only; NO diagonal fill in. public: typedef typename cobj::vector_type vector_type; @@ -226,8 +251,8 @@ public: // Generalise as required later if needed //////////////////////////////////////////////////////////////////////// - View_type View(void) const { - View_type accessor(*( (View_type *) this)); + View_type View(ViewMode mode) const { + View_type accessor(*( (View_type *) this),mode); return accessor; } @@ -662,9 +687,9 @@ public: _unified_buffer_size=0; surface_list.resize(0); - int osites = _grid->oSites(); + this->_osites = _grid->oSites(); - _entries.resize(this->_npoints* osites); + _entries.resize(this->_npoints* this->_osites); this->_entries_p = &_entries[0]; for(int ii=0;ii>()[2]; } // SYCL specific +accelerator_inline int acceleratorSIMTlane(int Nsimd) { +#ifdef GRID_SIMT + return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; +#else + return 0; +#endif +} // SYCL specific #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ @@ -224,7 +236,13 @@ NAMESPACE_BEGIN(Grid); #define accelerator_inline __host__ __device__ inline /*These routines define mapping from thread grid to loop & vector lane indexing */ -accelerator_inline int acceleratorSIMTlane(int Nsimd) { return hipThreadIdx_z; } // HIP specific +accelerator_inline int acceleratorSIMTlane(int Nsimd) { +#ifdef GRID_SIMT + return hipThreadIdx_z; +#else + return 0; +#endif +} // HIP specific #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ { \