diff --git a/Grid/algorithms/Algorithms.h b/Grid/algorithms/Algorithms.h index 48ea194b..7f27784b 100644 --- a/Grid/algorithms/Algorithms.h +++ b/Grid/algorithms/Algorithms.h @@ -29,9 +29,11 @@ Author: Peter Boyle #ifndef GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H +NAMESPACE_CHECK(algorithms); #include #include #include +NAMESPACE_CHECK(SparseMatrix); #include #include @@ -41,10 +43,12 @@ Author: Peter Boyle #include #include #include - +NAMESPACE_CHECK(approx); #include #include +NAMESPACE_CHECK(ConjGrad); #include +NAMESPACE_CHECK(BiCGSTAB); #include #include #include @@ -62,7 +66,9 @@ Author: Peter Boyle #include #include +NAMESPACE_CHECK(PowerMethod); #include +NAMESPACE_CHECK(CoarsendMatrix); #include #endif diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 4493d740..fb14ac32 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -186,10 +186,10 @@ public: hermop.HermOp(*Tn,y); - auto y_v = y.View(AcceleratorWrite); - auto Tn_v = Tn->View(AcceleratorWrite); - auto Tnp_v = Tnp->View(AcceleratorWrite); - auto Tnm_v = Tnm->View(AcceleratorWrite); + autoView( y_v , y, AcceleratorWrite); + autoView( Tn_v , (*Tn), AcceleratorWrite); + autoView( Tnp_v , (*Tnp), AcceleratorWrite); + autoView( Tnm_v , (*Tnm), AcceleratorWrite); const int Nsimd = CComplex::Nsimd(); accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); @@ -246,13 +246,14 @@ public: CartesianStencil Stencil; std::vector A; - + /////////////////////// // Interface /////////////////////// GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know - RealD M (const CoarseVector &in, CoarseVector &out){ + RealD M (const CoarseVector &in, CoarseVector &out) + { conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); @@ -263,12 +264,13 @@ public: double comms_usec = -usecond(); Stencil.HaloExchange(in,compressor); comms_usec += usecond(); - - auto in_v = in.View(AcceleratorRead); - auto out_v = out.View(AcceleratorWrite); + + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); typedef LatticeView Aview; - + Vector AcceleratorViewContainer; + for(int p=0;p &out) { @@ -542,10 +547,10 @@ public: blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi); - auto iZProj_v = iZProj.View(AcceleratorRead) ; - auto oZProj_v = oZProj.View(AcceleratorRead) ; - auto A_p = A[p].View(AcceleratorWrite); - auto A_self = A[self_stencil].View(AcceleratorWrite); + autoView( iZProj_v , iZProj, AcceleratorRead) ; + autoView( oZProj_v , oZProj, AcceleratorRead) ; + autoView( A_p , A[p], AcceleratorWrite); + autoView( A_self , A[self_stencil], AcceleratorWrite); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); // if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });} @@ -563,11 +568,11 @@ public: mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); { - auto tmp_ = tmp.View(AcceleratorWrite); - auto evenmask_ = evenmask.View(AcceleratorRead); - auto oddmask_ = oddmask.View(AcceleratorRead); - auto Mphie_ = Mphie.View(AcceleratorRead); - auto Mphio_ = Mphio.View(AcceleratorRead); + autoView( tmp_ , tmp, AcceleratorWrite); + autoView( evenmask_ , evenmask, AcceleratorRead); + autoView( oddmask_ , oddmask, AcceleratorRead); + autoView( Mphie_ , Mphie, AcceleratorRead); + autoView( Mphio_ , Mphio, AcceleratorRead); accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); }); @@ -575,8 +580,8 @@ public: blockProject(SelfProj,tmp,Subspace.subspace); - auto SelfProj_ = SelfProj.View(AcceleratorRead); - auto A_self = A[self_stencil].View(AcceleratorWrite); + autoView( SelfProj_ , SelfProj, AcceleratorRead); + autoView( A_self , A[self_stencil], AcceleratorWrite); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ for(int j=0;j #endif #endif - NAMESPACE_BEGIN(Grid); template struct FFTW { }; @@ -190,7 +189,7 @@ public: typedef typename sobj::scalar_type scalar; Lattice pgbuf(&pencil_g); - auto pgbuf_v = pgbuf.View(CpuWrite); + autoView(pgbuf_v , pgbuf, CpuWrite); typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; diff --git a/Grid/algorithms/iterative/BiCGSTAB.h b/Grid/algorithms/iterative/BiCGSTAB.h index 04328a77..f4e5cdda 100644 --- a/Grid/algorithms/iterative/BiCGSTAB.h +++ b/Grid/algorithms/iterative/BiCGSTAB.h @@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction LinearCombTimer.Start(); bo = beta * omega; - auto p_v = p.View(AcceleratorWrite); - auto r_v = r.View(AcceleratorWrite); - auto v_v = v.View(AcceleratorWrite); - accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); - }); + { + autoView( p_v , p, AcceleratorWrite); + autoView( r_v , r, AcceleratorRead); + autoView( v_v , v, AcceleratorRead); + accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); + }); + } LinearCombTimer.Stop(); LinalgTimer.Stop(); @@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction alpha = rho / Calpha.real(); LinearCombTimer.Start(); - auto h_v = h.View(AcceleratorWrite); - auto psi_v = psi.View(AcceleratorWrite); - accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); - }); - - auto s_v = s.View(AcceleratorWrite); - accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); - }); + { + autoView( p_v , p, AcceleratorRead); + autoView( r_v , r, AcceleratorRead); + autoView( v_v , v, AcceleratorRead); + autoView( psi_v,psi, AcceleratorRead); + autoView( h_v , h, AcceleratorWrite); + autoView( s_v , s, AcceleratorWrite); + accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); + }); + accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); + }); + } LinearCombTimer.Stop(); LinalgTimer.Stop(); @@ -166,13 +172,19 @@ class BiCGSTAB : public OperatorFunction omega = Comega.real() / norm2(t); LinearCombTimer.Start(); - auto t_v = t.View(AcceleratorWrite); - accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ - coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); - coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); - }); + { + autoView( psi_v,psi, AcceleratorWrite); + autoView( r_v , r, AcceleratorWrite); + autoView( h_v , h, AcceleratorRead); + autoView( s_v , s, AcceleratorRead); + autoView( t_v , t, AcceleratorRead); + accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ + coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); + coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); + }); + } LinearCombTimer.Stop(); - + cp = norm2(r); LinalgTimer.Stop(); diff --git a/Grid/algorithms/iterative/ConjugateGradient.h b/Grid/algorithms/iterative/ConjugateGradient.h index c8180a11..14f3d306 100644 --- a/Grid/algorithms/iterative/ConjugateGradient.h +++ b/Grid/algorithms/iterative/ConjugateGradient.h @@ -141,16 +141,16 @@ public: LinearCombTimer.Start(); { - auto psi_v = psi.View(AcceleratorWrite); - auto p_v = p.View(AcceleratorWrite); - auto r_v = r.View(AcceleratorWrite); + autoView( psi_v , psi, AcceleratorWrite); + autoView( p_v , p, AcceleratorWrite); + autoView( r_v , r, AcceleratorWrite); accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); - }); - LinearCombTimer.Stop(); - LinalgTimer.Stop(); + }); } + LinearCombTimer.Stop(); + LinalgTimer.Stop(); std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index 05ed8586..d2bec856 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -57,16 +57,17 @@ void basisOrthogonalize(std::vector &basis,Field &w,int k) template void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) { - typedef decltype(basis[0].View(CpuWrite)) View; - auto tmp_v = basis[0].View(CpuWrite); - Vector basis_v(basis.size(),tmp_v); - View *basis_vp = &basis_v[0]; - typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); - for(int k=0;k basis_v; basis_v.reserve(basis.size()); + + for(int k=0;k > Bt(thread_max() * Nm); // Thread private thread_region @@ -142,6 +143,7 @@ void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, i coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j])); }); } + for(int k=0;k &basis,Eigen::MatrixXd& Qt,int j0, int j1, i template void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) { - typedef decltype(basis[0].View(AcceleratorWrite)) View; - typedef typename Field::vector_object vobj; GridBase* grid = basis[0].Grid(); + typedef typename Field::vector_object vobj; + typedef decltype(basis[0].View(AcceleratorWrite)) View; result.Checkerboard() = basis[0].Checkerboard(); - auto result_v=result.View(AcceleratorWrite); - Vector basis_v(basis.size(),result_v); + + autoView(result_v,result, AcceleratorWrite); + Vector basis_v; basis_v.reserve(basis.size()); View * basis_vp = &basis_v[0]; - for(int k=0;k Qt_jv(Nm); - double * Qt_j = & Qt_jv[0]; + + for(int k=0;k Qt_jv(Nm); double * Qt_j = & Qt_jv[0]; + for(int k=0;koSites(),vobj::Nsimd(),{ auto B=coalescedRead(basis_vp[k0][ss]); B=Zero(); @@ -171,6 +175,7 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in } coalescedWrite(result_v[ss], B); }); + for(int k=0;k diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index 3ddd98b3..6e38d062 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -169,8 +169,9 @@ private: public: static void Print(void); - static void ViewClose(void* AccPtr,ViewMode mode); - static void *ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); + static int isOpen (void* CpuPtr); + static void ViewClose(void* CpuPtr,ViewMode mode); + static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); }; diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index fa253c6c..a3408095 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -461,6 +461,17 @@ void MemoryManager::Print(void) std::cout << GridLogDebug << "--------------------------------------------" << std::endl; }; +int MemoryManager::isOpen (void* _CpuPtr) +{ + uint64_t CpuPtr = (uint64_t)_CpuPtr; + if ( EntryPresent(CpuPtr) ){ + auto AccCacheIterator = EntryLookup(CpuPtr); + auto & AccCache = AccCacheIterator->second; + return AccCache.cpuLock+AccCache.accLock; + } else { + return 0; + } +} NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 0008add4..e7e67753 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -9,6 +9,7 @@ NAMESPACE_BEGIN(Grid); void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; +int MemoryManager::isOpen (void* CpuPtr) { return 0;} void MemoryManager::Print(void){}; void MemoryManager::NotifyDeletion(void *ptr){}; diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index 1c99e797..4de2bbe2 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -72,12 +72,14 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen } } } - auto rhs_v = rhs.View(AcceleratorRead); - auto buffer_p = & buffer[0]; - auto table = &Cshift_table[0]; - accelerator_for(i,ent,1,{ - buffer_p[table[i].first]=rhs_v[table[i].second]; - }); + { + autoView(rhs_v , rhs, AcceleratorRead); + auto buffer_p = & buffer[0]; + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; + }); + } } /////////////////////////////////////////////////////////////////// @@ -100,8 +102,8 @@ Gather_plane_extract(const Lattice &rhs, int e2=rhs.Grid()->_slice_block[dimension]; int n1=rhs.Grid()->_slice_stride[dimension]; - auto rhs_v = rhs.View(AcceleratorRead); if ( cbmask ==0x3){ + autoView(rhs_v , rhs, AcceleratorRead); accelerator_for2d(n,e1,b,e2,1,{ int o = n*n1; int offset = b+n*e2; @@ -110,8 +112,8 @@ Gather_plane_extract(const Lattice &rhs, extract(temp,pointers,offset); }); } else { + autoView(rhs_v , rhs, AcceleratorRead); - Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - auto rhs_v = rhs.View(AcceleratorWrite); + autoView( rhs_v , rhs, AcceleratorWrite); accelerator_for2d(n,e1,b,e2,1,{ int o = n*rhs.Grid()->_slice_stride[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension]; @@ -216,7 +220,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA // Test_cshift_red_black code. // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<_slice_stride[dimension]; @@ -272,13 +276,14 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs } } - auto rhs_v = rhs.View(AcceleratorRead); - auto lhs_v = lhs.View(AcceleratorWrite); - auto table = &Cshift_table[0]; - accelerator_for(i,ent,1,{ - lhs_v[table[i].first]=rhs_v[table[i].second]; - }); - + { + autoView(rhs_v , rhs, AcceleratorRead); + autoView(lhs_v , lhs, AcceleratorWrite); + auto table = &Cshift_table[0]; + accelerator_for(i,ent,1,{ + lhs_v[table[i].first]=rhs_v[table[i].second]; + }); + } } template void Copy_plane_permute(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) @@ -315,12 +320,14 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice accelerator_inline -const lobj & eval(const uint64_t ss, const LatticeExprView &arg) +const lobj & eval(const uint64_t ss, const LatticeView &arg) { return arg[ss]; } // What needs this? +// Cannot be legal on accelerator +// Comparison must convert #if 1 template accelerator_inline const lobj & eval(const uint64_t ss, const Lattice &arg) diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index b1252952..c204af5c 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -36,9 +36,9 @@ NAMESPACE_BEGIN(Grid); template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); - auto rhs_v = rhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); conformable(ret,rhs); conformable(lhs,rhs); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ @@ -55,9 +55,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); - auto rhs_v = rhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -72,9 +72,9 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); - auto rhs_v = rhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -88,9 +88,9 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); - auto rhs_v = rhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -107,8 +107,8 @@ template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; mult(&tmp,&lhs_v(ss),&rhs); @@ -120,8 +120,8 @@ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -134,8 +134,8 @@ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -147,8 +147,8 @@ template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -164,8 +164,8 @@ template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(AcceleratorWrite); - auto rhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -178,8 +178,8 @@ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(AcceleratorWrite); - auto rhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -192,8 +192,8 @@ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(AcceleratorWrite); - auto rhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -205,8 +205,8 @@ template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(AcceleratorWrite); - auto rhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( rhs_v , lhs, AcceleratorRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -220,9 +220,9 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(AcceleratorWrite); - auto x_v = x.View(AcceleratorRead); - auto y_v = y.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( x_v , x, AcceleratorRead); + autoView( y_v , y, AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+y_v(ss); coalescedWrite(ret_v[ss],tmp); @@ -233,9 +233,9 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(AcceleratorWrite); - auto x_v = x.View(AcceleratorRead); - auto y_v = y.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( x_v , x, AcceleratorRead); + autoView( y_v , y, AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+b*y_v(ss); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 9aae3333..65f71441 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -84,6 +84,7 @@ public: ///////////////////////////////////////////////////////////////////////////////// void SetViewMode(ViewMode mode) { LatticeView accessor(*( (LatticeAccelerator *) this),mode); + accessor.ViewClose(); } ///////////////////////////////////////////////////////////////////////////////// // Return a view object that may be dereferenced in site loops. @@ -123,6 +124,7 @@ public: auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); ExpressionViewClose(exprCopy); return *this; } @@ -145,6 +147,7 @@ public: auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); ExpressionViewClose(exprCopy); return *this; } @@ -166,6 +169,7 @@ public: auto tmp = eval(ss,exprCopy); vstream(me[ss],tmp); }); + me.ViewClose(); ExpressionViewClose(exprCopy); return *this; } @@ -221,6 +225,7 @@ public: thread_for(ss,me.size(),{ me[ss]= r; }); + me.ViewClose(); return *this; } @@ -278,6 +283,7 @@ public: accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); + me.ViewClose(); him.ViewClose(); return *this; } @@ -292,6 +298,7 @@ public: accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); + me.ViewClose(); him.ViewClose(); return *this; } /////////////////////////////////////////// diff --git a/Grid/lattice/Lattice_comparison.h b/Grid/lattice/Lattice_comparison.h index 17a61750..6a29be94 100644 --- a/Grid/lattice/Lattice_comparison.h +++ b/Grid/lattice/Lattice_comparison.h @@ -78,9 +78,9 @@ template inline Lattice LLComparison(vfunctor op,const Lattice &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(CpuRead); - auto rhs_v = rhs.View(CpuRead); - auto ret_v = ret.View(CpuWrite); + autoView( lhs_v, lhs, CpuRead); + autoView( rhs_v, rhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, rhs_v.size(), { ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); }); @@ -93,8 +93,8 @@ template inline Lattice LSComparison(vfunctor op,const Lattice &lhs,const robj &rhs) { Lattice ret(lhs.Grid()); - auto lhs_v = lhs.View(CpuRead); - auto ret_v = ret.View(CpuWrite); + autoView( lhs_v, lhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, lhs_v.size(), { ret_v[ss]=op(lhs_v[ss],rhs); }); @@ -107,8 +107,8 @@ template inline Lattice SLComparison(vfunctor op,const lobj &lhs,const Lattice &rhs) { Lattice ret(rhs.Grid()); - auto rhs_v = rhs.View(CpuRead); - auto ret_v = ret.View(CpuWrite); + autoView( rhs_v, rhs, CpuRead); + autoView( ret_v, ret, CpuWrite); thread_for( ss, rhs_v.size(), { ret_v[ss]=op(lhs,rhs_v[ss]); }); diff --git a/Grid/lattice/Lattice_coordinate.h b/Grid/lattice/Lattice_coordinate.h index b8e73b25..cd0f11ee 100644 --- a/Grid/lattice/Lattice_coordinate.h +++ b/Grid/lattice/Lattice_coordinate.h @@ -37,7 +37,7 @@ template inline void LatticeCoordinate(Lattice &l,int mu) GridBase *grid = l.Grid(); int Nsimd = grid->iSites(); - auto l_v = l.View(CpuWrite); + autoView(l_v, l, CpuWrite); thread_for( o, grid->oSites(), { vector_type vI; Coordinate gcoor; diff --git a/Grid/lattice/Lattice_local.h b/Grid/lattice/Lattice_local.h index e497a748..1b31e9b3 100644 --- a/Grid/lattice/Lattice_local.h +++ b/Grid/lattice/Lattice_local.h @@ -43,8 +43,8 @@ template inline auto localNorm2 (const Lattice &rhs)-> Lattice { Lattice ret(rhs.Grid()); - auto rhs_v = rhs.View(AcceleratorRead); - auto ret_v = ret.View(AcceleratorWrite); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); }); @@ -56,9 +56,9 @@ template inline auto localInnerProduct (const Lattice &lhs,const Lattice &rhs) -> Lattice { Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(AcceleratorRead); - auto rhs_v = rhs.View(AcceleratorRead); - auto ret_v = ret.View(AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); }); @@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice &lhs,const Lattice &rhs) -> Latt typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(rr())) srr; Lattice ret(rhs.Grid()); - auto lhs_v = lhs.View(AcceleratorRead); - auto rhs_v = rhs.View(AcceleratorRead); - auto ret_v = ret.View(AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); + autoView( rhs_v , rhs, AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); accelerator_for(ss,rhs_v.size(),1,{ // FIXME had issues with scalar version of outer // Use vector [] operator and don't read coalesce this loop diff --git a/Grid/lattice/Lattice_matrix_reduction.h b/Grid/lattice/Lattice_matrix_reduction.h index 88de5210..7c470fef 100644 --- a/Grid/lattice/Lattice_matrix_reduction.h +++ b/Grid/lattice/Lattice_matrix_reduction.h @@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v = X.View(CpuRead); - auto Y_v = Y.View(CpuRead); - auto R_v = R.View(CpuWrite); + autoView( X_v , X, CpuRead); + autoView( Y_v , Y, CpuRead); + autoView( R_v , R, CpuWrite); thread_region { std::vector s_x(Nblock); @@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v = X.View(CpuRead); - auto R_v = R.View(CpuWrite); + autoView( X_v , X, CpuRead); + autoView( R_v , R, CpuWrite); thread_region { @@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice int ostride=FullGrid->_ostride[Orthog]; typedef typename vobj::vector_typeD vector_typeD; - auto lhs_v = lhs.View(CpuRead); - auto rhs_v = rhs.View(CpuRead); + autoView( lhs_v , lhs, CpuRead); + autoView( rhs_v , rhs, CpuRead); thread_region { std::vector Left(Nblock); std::vector Right(Nblock); diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h index af98c07b..2ec97b08 100644 --- a/Grid/lattice/Lattice_peekpoke.h +++ b/Grid/lattice/Lattice_peekpoke.h @@ -46,8 +46,8 @@ auto PeekIndex(const Lattice &lhs,int i) -> Lattice(vobj(),i))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - auto ret_v = ret.View(CpuWrite); - auto lhs_v = lhs.View(CpuRead); + autoView( ret_v, ret, CpuWrite); + autoView( lhs_v, lhs, CpuRead); thread_for( ss, lhs_v.size(), { ret_v[ss] = peekIndex(lhs_v[ss],i); }); @@ -58,8 +58,8 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice(vobj(),i,j))> ret(lhs.Grid()); ret.Checkerboard()=lhs.Checkerboard(); - auto ret_v = ret.View(CpuWrite); - auto lhs_v = lhs.View(CpuRead); + autoView( ret_v, ret, CpuWrite); + autoView( lhs_v, lhs, CpuRead); thread_for( ss, lhs_v.size(), { ret_v[ss] = peekIndex(lhs_v[ss],i,j); }); @@ -72,8 +72,8 @@ auto PeekIndex(const Lattice &lhs,int i,int j) -> Lattice void PokeIndex(Lattice &lhs,const Lattice(vobj(),0))> & rhs,int i) { - auto rhs_v = rhs.View(CpuRead); - auto lhs_v = lhs.View(CpuWrite); + autoView( rhs_v, rhs, CpuRead); + autoView( lhs_v, lhs, CpuWrite); thread_for( ss, lhs_v.size(), { pokeIndex(lhs_v[ss],rhs_v[ss],i); }); @@ -81,8 +81,8 @@ void PokeIndex(Lattice &lhs,const Lattice(vobj() template void PokeIndex(Lattice &lhs,const Lattice(vobj(),0,0))> & rhs,int i,int j) { - auto rhs_v = rhs.View(CpuRead); - auto lhs_v = lhs.View(CpuWrite); + autoView( rhs_v, rhs, CpuRead); + autoView( lhs_v, lhs, CpuWrite); thread_for( ss, lhs_v.size(), { pokeIndex(lhs_v[ss],rhs_v[ss],i,j); }); @@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice &l,const Coordinate &site){ // extract-modify-merge cycle is easiest way and this is not perf critical ExtractBuffer buf(Nsimd); - auto l_v = l.View(CpuWrite); + autoView( l_v , l, CpuWrite); if ( rank == grid->ThisRank() ) { extract(l_v[odx],buf); buf[idx] = s; @@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice &l,const Coordinate &site){ grid->GlobalCoorToRankIndex(rank,odx,idx,site); ExtractBuffer buf(Nsimd); - auto l_v = l.View(CpuWrite); + autoView( l_v , l, CpuWrite); extract(l_v[odx],buf); s = buf[idx]; @@ -173,7 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site){ idx= grid->iIndex(site); odx= grid->oIndex(site); - auto l_v = l.View(CpuRead); + autoView( l_v , l, CpuRead); scalar_type * vp = (scalar_type *)&l_v[odx]; scalar_type * pt = (scalar_type *)&s; @@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site){ idx= grid->iIndex(site); odx= grid->oIndex(site); - auto l_v = l.View(CpuWrite); + autoView( l_v , l, CpuWrite); scalar_type * vp = (scalar_type *)&l_v[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w inline Lattice adj(const Lattice &lhs){ Lattice ret(lhs.Grid()); - auto lhs_v = lhs.View(AcceleratorRead); - auto ret_v = ret.View(AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], adj(lhs_v(ss))); }); @@ -50,8 +50,8 @@ template inline Lattice adj(const Lattice &lhs){ template inline Lattice conjugate(const Lattice &lhs){ Lattice ret(lhs.Grid()); - auto lhs_v = lhs.View(AcceleratorRead); - auto ret_v = ret.View(AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index a3a1192d..16742947 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -79,11 +79,11 @@ template inline typename vobj::scalar_object sum(const Lattice &arg) { #if defined(GRID_CUDA)||defined(GRID_HIP) - auto arg_v = arg.View(AcceleratorRead); + autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); auto ssum= sum_gpu(&arg_v[0],osites); #else - auto arg_v = arg.View(CpuRead); + autoView(arg_v, arg, CpuRead); Integer osites = arg.Grid()->oSites(); auto ssum= sum_cpu(&arg_v[0],osites); #endif @@ -113,8 +113,8 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ const uint64_t sites = grid->oSites(); // Might make all code paths go this way. - auto left_v = left.View(AcceleratorRead); - auto right_v=right.View(AcceleratorRead); + autoView( left_v , left, AcceleratorRead); + autoView( right_v,right, AcceleratorRead); // GPU - SIMT lane compliance... typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; @@ -168,9 +168,9 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt const uint64_t sites = grid->oSites(); // GPU - auto x_v=x.View(AcceleratorRead); - auto y_v=y.View(AcceleratorRead); - auto z_v=z.View(AcceleratorWrite); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; Vector inner_tmp(sites); @@ -257,7 +257,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< // sum over reduced dimension planes, breaking out orthog dir // Parallel over orthog direction - auto Data_v=Data.View(CpuRead); + autoView( Data_v, Data, CpuRead); thread_for( r,rd, { int so=r*grid->_ostride[orthogdim]; // base offset for start of plane for(int n=0;n & result, const Latti int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - auto lhv=lhs.View(CpuRead); - auto rhv=rhs.View(CpuRead); + autoView( lhv, lhs, CpuRead); + autoView( rhv, rhs, CpuRead); thread_for( r,rd,{ int so=r*grid->_ostride[orthogdim]; // base offset for start of plane @@ -443,9 +443,9 @@ static void sliceMaddVector(Lattice &R,std::vector &a,const Lattice tensor_reduced at; at=av; - auto Rv=R.View(CpuWrite); - auto Xv=X.View(CpuRead); - auto Yv=Y.View(CpuRead); + autoView( Rv, R, CpuWrite); + autoView( Xv, X, CpuRead); + autoView( Yv, Y, CpuRead); thread_for2d( n, e1, b,e2, { int ss= so+n*stride+b; Rv[ss] = at*Xv[ss]+Yv[ss]; @@ -501,9 +501,9 @@ static void sliceMaddMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto X_v=X.View(CpuRead); - auto Y_v=Y.View(CpuRead); - auto R_v=R.View(CpuWrite); + autoView( X_v, X, CpuRead); + autoView( Y_v, Y, CpuRead); + autoView( R_v, R, CpuWrite); thread_region { Vector s_x(Nblock); @@ -554,8 +554,8 @@ static void sliceMulMatrix (Lattice &R,Eigen::MatrixXcd &aa,const Lattice< int block =FullGrid->_slice_block [Orthog]; int nblock=FullGrid->_slice_nblock[Orthog]; int ostride=FullGrid->_ostride[Orthog]; - auto R_v = R.View(CpuWrite); - auto X_v = X.View(CpuRead); + autoView( R_v, R, CpuWrite); + autoView( X_v, X, CpuRead); thread_region { std::vector s_x(Nblock); @@ -613,8 +613,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice typedef typename vobj::vector_typeD vector_typeD; - auto lhs_v=lhs.View(CpuRead); - auto rhs_v=rhs.View(CpuRead); + autoView( lhs_v, lhs, CpuRead); + autoView( rhs_v, rhs, CpuRead); thread_region { std::vector Left(Nblock); diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index e5da8d35..e5e63716 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -375,7 +375,7 @@ public: int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int words = sizeof(scalar_object) / sizeof(scalar_type); - auto l_v = l.View(CpuWrite); + autoView(l_v, l, CpuWrite); thread_for( ss, osites, { ExtractBuffer buf(Nsimd); for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times diff --git a/Grid/lattice/Lattice_trace.h b/Grid/lattice/Lattice_trace.h index 8d1f85bd..b5d80ccc 100644 --- a/Grid/lattice/Lattice_trace.h +++ b/Grid/lattice/Lattice_trace.h @@ -41,8 +41,8 @@ template inline auto trace(const Lattice &lhs) -> Lattice { Lattice ret(lhs.Grid()); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); + autoView(ret_v , ret, AcceleratorWrite); + autoView(lhs_v , lhs, AcceleratorRead); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], trace(lhs_v(ss))); }); @@ -56,8 +56,8 @@ template inline auto TraceIndex(const Lattice &lhs) -> Lattice(vobj()))> { Lattice(vobj()))> ret(lhs.Grid()); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); + autoView( ret_v , ret, AcceleratorWrite); + autoView( lhs_v , lhs, AcceleratorRead); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { coalescedWrite(ret_v[ss], traceIndex(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 9e98d111..7362060a 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -46,11 +46,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine) //////////////////////////////////////////////////////////////////////////////////////////// // remove and insert a half checkerboard //////////////////////////////////////////////////////////////////////////////////////////// -template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full){ +template inline void pickCheckerboard(int cb,Lattice &half,const Lattice &full) +{ half.Checkerboard() = cb; - auto half_v = half.View(CpuWrite); - auto full_v = full.View(CpuRead); + autoView( half_v, half, CpuWrite); + autoView( full_v, full, CpuRead); thread_for(ss, full.Grid()->oSites(),{ int cbos; Coordinate coor; @@ -63,10 +64,11 @@ template inline void pickCheckerboard(int cb,Lattice &half,con } }); } -template inline void setCheckerboard(Lattice &full,const Lattice &half){ +template inline void setCheckerboard(Lattice &full,const Lattice &half) +{ int cb = half.Checkerboard(); - auto half_v = half.View(CpuRead); - auto full_v = full.View(CpuWrite); + autoView( half_v , half, CpuRead); + autoView( full_v , full, CpuWrite); thread_for(ss,full.Grid()->oSites(),{ Coordinate coor; @@ -92,79 +94,15 @@ inline void blockProject(Lattice > &coarseData, Lattice ip(coarse); - auto coarseData_ = coarseData.View(AcceleratorWrite); - auto ip_ = ip.View(AcceleratorWrite); + autoView( coarseData_ , coarseData, AcceleratorWrite); + autoView( ip_ , ip, AcceleratorWrite); for(int v=0;voSites(), vobj::Nsimd(), { coalescedWrite(coarseData_[sc](v),ip_(sc)); - }); + }); } } -#if 0 -template -inline void blockProject1(Lattice > &coarseData, - const Lattice &fineData, - const std::vector > &Basis) -{ - typedef iVector coarseSiteData; - coarseSiteData elide; - typedef decltype(coalescedRead(elide)) ScalarComplex; - GridBase * fine = fineData.Grid(); - GridBase * coarse= coarseData.Grid(); - int _ndimension = coarse->_ndimension; - - // checks - assert( nbasis == Basis.size() ); - subdivides(coarse,fine); - for(int i=0;i_rdimensions[d] / coarse->_rdimensions[d]; - assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); - } - int blockVol = fine->oSites()/coarse->oSites(); - - coarseData=Zero(); - - auto fineData_ = fineData.View(AcceleratorRead); - auto coarseData_ = coarseData.View(AcceleratorWrite); - //////////////////////////////////////////////////////////////////////////////////////////////////////// - // To make this lock free, loop over coars parallel, and then loop over fine associated with coarse. - // Otherwise do fine inner product per site, and make the update atomic - //////////////////////////////////////////////////////////////////////////////////////////////////////// - accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), { - - auto sc=sci/nbasis; - auto i=sci%nbasis; - auto Basis_ = Basis[i].View(AcceleratorRead); - - Coordinate coor_c(_ndimension); - Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - - int sf; - decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); - - for(int sb=0;sb_rdimensions); - - reduce=reduce+innerProduct(Basis_(sf),fineData_(sf)); - } - coalescedWrite(coarseData_[sc](i),reduce); - }); - return; -} -#endif template inline void blockZAXPY(Lattice &fineZ, @@ -191,10 +129,10 @@ inline void blockZAXPY(Lattice &fineZ, assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); } - auto fineZ_ = fineZ.View(AcceleratorWrite); - auto fineX_ = fineX.View(AcceleratorRead); - auto fineY_ = fineY.View(AcceleratorRead); - auto coarseA_= coarseA.View(AcceleratorRead); + autoView( fineZ_ , fineZ, AcceleratorWrite); + autoView( fineX_ , fineX, AcceleratorRead); + autoView( fineY_ , fineY, AcceleratorRead); + autoView( coarseA_, coarseA, AcceleratorRead); accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { @@ -229,8 +167,8 @@ inline void blockInnerProduct(Lattice &CoarseInner, // Precision promotion? fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); - auto CoarseInner_ = CoarseInner.View(AcceleratorWrite); - auto coarse_inner_ = coarse_inner.View(AcceleratorRead); + autoView( CoarseInner_ , CoarseInner, AcceleratorWrite); + autoView( coarse_inner_ , coarse_inner, AcceleratorRead); accelerator_for(ss, coarse->oSites(), 1, { CoarseInner_[ss] = coarse_inner_[ss]; }); @@ -265,8 +203,8 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) // Turn this around to loop threaded over sc and interior loop // over sf would thread better - auto coarseData_ = coarseData.View(AcceleratorWrite); - auto fineData_ = fineData.View(AcceleratorRead); + autoView( coarseData_ , coarseData, AcceleratorWrite); + autoView( fineData_ , fineData, AcceleratorRead); accelerator_for(sc,coarse->oSites(),1,{ @@ -359,8 +297,8 @@ inline void blockPromote(const Lattice > &coarseData, for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } - auto fineData_ = fineData.View(AcceleratorWrite); - auto coarseData_ = coarseData.View(AcceleratorRead); + autoView( fineData_ , fineData, AcceleratorWrite); + autoView( coarseData_ , coarseData, AcceleratorRead); // Loop with a cache friendly loop ordering accelerator_for(sf,fine->oSites(),1,{ @@ -373,7 +311,7 @@ inline void blockPromote(const Lattice > &coarseData, Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); for(int i=0;i > &coarseData, for(int i=0;i > ip = PeekIndex<0>(coarseData,i); Lattice cip(coarse); - auto cip_ = cip.View(AcceleratorWrite); - auto ip_ = ip.View(AcceleratorRead); + autoView( cip_ , cip, AcceleratorWrite); + autoView( ip_ , ip, AcceleratorRead); accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ coalescedWrite(cip_[sc], ip_(sc)()); }); @@ -469,8 +407,8 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate rdt = Tg->_rdimensions; Coordinate ist = Tg->_istride; Coordinate ost = Tg->_ostride; - auto t_v = To.View(AcceleratorWrite); - auto f_v = From.View(AcceleratorRead); + autoView( t_v , To, AcceleratorWrite); + autoView( f_v , From, AcceleratorRead); accelerator_for(idx,Fg->lSites(),1,{ sobj s; Coordinate Fcoor(nd); @@ -717,7 +655,7 @@ unvectorizeToLexOrdArray(std::vector &out, const Lattice &in) } //loop over outer index - auto in_v = in.View(CpuRead); + autoView( in_v , in, CpuRead); thread_for(in_oidx,in_grid->oSites(),{ //Assemble vector of pointers to output elements ExtractPointerArray out_ptrs(in_nsimd); @@ -810,7 +748,7 @@ vectorizeFromLexOrdArray( std::vector &in, Lattice &out) icoor[lane].resize(ndim); grid->iCoorFromIindex(icoor[lane],lane); } - auto out_v = out.View(CpuWrite); + autoView( out_v , out, CpuWrite); thread_for(oidx, grid->oSites(),{ //Assemble vector of pointers to output elements ExtractPointerArray ptrs(nsimd); @@ -913,7 +851,7 @@ void precisionChange(Lattice &out, const Lattice &in) std::vector in_slex_conv(in_grid->lSites()); unvectorizeToLexOrdArray(in_slex_conv, in); - auto out_v = out.View(CpuWrite); + autoView( out_v , out, CpuWrite); thread_for(out_oidx,out_grid->oSites(),{ Coordinate out_ocoor(ndim); out_grid->oCoorFromOindex(out_ocoor, out_oidx); diff --git a/Grid/lattice/Lattice_transpose.h b/Grid/lattice/Lattice_transpose.h index c17a808b..b11175dd 100644 --- a/Grid/lattice/Lattice_transpose.h +++ b/Grid/lattice/Lattice_transpose.h @@ -41,8 +41,8 @@ NAMESPACE_BEGIN(Grid); template inline Lattice transpose(const Lattice &lhs){ Lattice ret(lhs.Grid()); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); }); @@ -56,8 +56,8 @@ template inline auto TransposeIndex(const Lattice &lhs) -> Lattice(vobj()))> { Lattice(vobj()))> ret(lhs.Grid()); - auto ret_v = ret.View(AcceleratorWrite); - auto lhs_v = lhs.View(AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + autoView( lhs_v, lhs, AcceleratorRead); accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ coalescedWrite(ret_v[ss] , transposeIndex(lhs_v(ss))); }); diff --git a/Grid/lattice/Lattice_unary.h b/Grid/lattice/Lattice_unary.h index 10aa7472..07424b3d 100644 --- a/Grid/lattice/Lattice_unary.h +++ b/Grid/lattice/Lattice_unary.h @@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid); template Lattice pow(const Lattice &rhs_i,RealD y){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(AcceleratorRead); - auto ret = ret_i.View(AcceleratorWrite); + autoView( rhs, rhs_i, AcceleratorRead); + autoView( ret, ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),1,{ ret[ss]=pow(rhs[ss],y); @@ -45,8 +45,8 @@ template Lattice pow(const Lattice &rhs_i,RealD y){ } template Lattice mod(const Lattice &rhs_i,Integer y){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(AcceleratorRead); - auto ret = ret_i.View(AcceleratorWrite); + autoView( rhs , rhs_i, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],mod(rhs(ss),y)); @@ -56,8 +56,8 @@ template Lattice mod(const Lattice &rhs_i,Integer y){ template Lattice div(const Lattice &rhs_i,Integer y){ Lattice ret_i(rhs_i.Grid()); - auto ret = ret_i.View(AcceleratorWrite); - auto rhs = rhs_i.View(AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); + autoView( rhs , rhs_i, AcceleratorRead); ret.Checkerboard() = rhs_i.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],div(rhs(ss),y)); @@ -67,8 +67,8 @@ template Lattice div(const Lattice &rhs_i,Integer y){ template Lattice expMat(const Lattice &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ Lattice ret_i(rhs_i.Grid()); - auto rhs = rhs_i.View(AcceleratorRead); - auto ret = ret_i.View(AcceleratorWrite); + autoView( rhs , rhs_i, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); ret.Checkerboard() = rhs.Checkerboard(); accelerator_for(ss,rhs.size(),obj::Nsimd(),{ coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index b12dd2b7..d21ab874 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -25,6 +25,7 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) template class LatticeAccelerator : public LatticeBase { protected: + //public: GridBase *_grid; int checkerboard; vobj *_odata; // A managed pointer @@ -47,7 +48,7 @@ public: // The copy constructor for this will need to be used by device lambda functions ///////////////////////////////////////////////////////////////////////////////////////// template -class LatticeExprView : public LatticeAccelerator +class LatticeView : public LatticeAccelerator { public: // Rvalue @@ -68,7 +69,12 @@ public: accelerator_inline uint64_t end(void) const { return this->_odata_size; }; accelerator_inline uint64_t size(void) const { return this->_odata_size; }; - LatticeExprView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me){} + LatticeView(const LatticeAccelerator &refer_to_me) : LatticeAccelerator (refer_to_me){} + LatticeView(const LatticeView &refer_to_me) = default; // Trivially copyable + LatticeView(const LatticeAccelerator &refer_to_me,ViewMode mode) : LatticeAccelerator (refer_to_me) + { + this->ViewOpen(mode); + } // Host functions void ViewOpen(ViewMode mode) @@ -89,46 +95,20 @@ public: } }; - - -/////////////////////////////////////////////////////////////////////// -// An object to be stored in a shared_ptr to clean up after last view. -// UserView constructor,destructor updates view manager -// Non-copyable object??? Second base with copy/= deleted? -/////////////////////////////////////////////////////////////////////// -class MemViewDeleter { - public: - void *cpu_ptr; - ViewMode mode; - ~MemViewDeleter(){ - MemoryManager::ViewClose(cpu_ptr,mode); - } -}; -template -class LatticeView : public LatticeExprView +// Little autoscope assister +template +class ViewCloser { -#ifndef GRID_UVM - std::shared_ptr Deleter; -#endif -public: -#ifdef GRID_UVM - LatticeView(const LatticeAccelerator &refer_to_me,ViewMode mode) : - LatticeExprView (refer_to_me) - { - } -#else - LatticeView(const LatticeView &orig) : LatticeExprView(orig) { } - LatticeView(const LatticeAccelerator &refer_to_me,ViewMode mode) : - LatticeExprView (refer_to_me), Deleter(new MemViewDeleter) - { - // std::cout << "FIXME - copy shared pointer? View Open in LatticeView"<_odata<ViewOpen(mode); - Deleter->cpu_ptr = this->cpu_ptr; - Deleter->mode = mode; - } -#endif + View v; // Take a copy of view and call view close when I go out of scope automatically + public: + ViewCloser(View &_v) : v(_v) {}; + ~ViewCloser() { v.ViewClose(); } }; +#define autoView(l_v,l,mode) \ + auto l_v = l.View(mode); \ + ViewCloser _autoView##l_v(l_v); + ///////////////////////////////////////////////////////////////////////////////////////// // Lattice expression types used by ET to assemble the AST // @@ -142,7 +122,7 @@ template using is_lattice = std::is_base_of; template using is_lattice_expr = std::is_base_of; template struct ViewMapBase { typedef T Type; }; -template struct ViewMapBase { typedef LatticeExprView Type; }; +template struct ViewMapBase { typedef LatticeView Type; }; template using ViewMap = ViewMapBase::value >; template diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index a8ae90ec..0b726db9 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -232,15 +232,17 @@ public: if ( Params.twists[mu] ) { Uconj = where(coor==neglink,-Uconj,Uconj); } - - auto U_v = U.View(CpuRead); - auto Uds_v = Uds.View(CpuWrite); - auto Uconj_v = Uconj.View(CpuRead); - auto Utmp_v= Utmp.View(CpuWrite); - thread_foreach(ss,U_v,{ - Uds_v[ss](0)(mu) = U_v[ss](); - Uds_v[ss](1)(mu) = Uconj_v[ss](); - }); + + { + autoView( U_v , U, CpuRead); + autoView( Uconj_v , Uconj, CpuRead); + autoView( Uds_v , Uds, CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,U_v,{ + Uds_v[ss](0)(mu) = U_v[ss](); + Uds_v[ss](1)(mu) = Uconj_v[ss](); + }); + } U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary Uconj = adj(Cshift(Uconj,mu,-1)); @@ -250,19 +252,25 @@ public: Utmp = where(coor==0,Uconj,Utmp); } - thread_foreach(ss,Utmp_v,{ - Uds_v[ss](0)(mu+4) = Utmp_v[ss](); - }); - + { + autoView( Uds_v , Uds, CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,Utmp_v,{ + Uds_v[ss](0)(mu+4) = Utmp_v[ss](); + }); + } Utmp = Uconj; if ( Params.twists[mu] ) { Utmp = where(coor==0,U,Utmp); } - - thread_foreach(ss,Utmp_v,{ - Uds_v[ss](1)(mu+4) = Utmp_v[ss](); - }); - + + { + autoView( Uds_v , Uds, CpuWrite); + autoView( Utmp_v, Utmp, CpuWrite); + thread_foreach(ss,Utmp_v,{ + Uds_v[ss](1)(mu+4) = Utmp_v[ss](); + }); + } } } @@ -272,11 +280,14 @@ public: GaugeLinkField link(mat.Grid()); // use lorentz for flavour as hack. auto tmp = TraceIndex(outerProduct(Btilde, A)); - auto link_v = link.View(CpuWrite); - auto tmp_v = tmp.View(CpuRead); - thread_foreach(ss,tmp_v,{ - link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); - }); + + { + autoView( link_v , link, CpuWrite); + autoView( tmp_v , tmp, CpuRead); + thread_foreach(ss,tmp_v,{ + link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); + }); + } PokeIndex(mat, link, mu); return; } @@ -306,16 +317,18 @@ public: GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - auto tmp_v = tmp.View(CpuWrite); - auto Atilde_v = Atilde.View(CpuRead); - auto Btilde_v = Btilde.View(CpuRead); - thread_for(ss,tmp.Grid()->oSites(),{ - for (int s = 0; s < Ls; s++) { - int sF = s + Ls * ss; - auto ttmp = traceIndex(outerProduct(Btilde_v[sF], Atilde_v[sF])); - tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); - } - }); + { + autoView( tmp_v , tmp, CpuWrite); + autoView( Atilde_v , Atilde, CpuRead); + autoView( Btilde_v , Btilde, CpuRead); + thread_for(ss,tmp.Grid()->oSites(),{ + for (int s = 0; s < Ls; s++) { + int sF = s + Ls * ss; + auto ttmp = traceIndex(outerProduct(Btilde_v[sF], Atilde_v[sF])); + tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); + } + }); + } PokeIndex(mat, tmp, mu); return; } diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 05143551..aa8fb150 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -264,8 +264,8 @@ private: { CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(CpuWrite); - auto F_v = F.View(CpuRead); + autoView(T_v,T,CpuWrite); + autoView(F_v,F,CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); @@ -282,8 +282,8 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(CpuWrite); - auto F_v = F.View(CpuRead); + autoView(T_v, T,CpuWrite); + autoView(F_v, F,CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 1) = -F_v[i]()(); @@ -300,8 +300,8 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(CpuWrite); - auto F_v = F.View(CpuRead); + autoView(T_v,T,CpuWrite); + autoView(F_v,F,CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); @@ -318,8 +318,8 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(CpuWrite); - auto F_v = F.View(CpuRead); + autoView( T_v , T, CpuWrite); + autoView( F_v , F, CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 1) = timesI(F_v[i]()()); @@ -336,8 +336,8 @@ private: CloverFieldType T(F.Grid()); T = Zero(); - auto T_v = T.View(CpuWrite); - auto F_v = F.View(CpuRead); + autoView( T_v ,T,CpuWrite); + autoView( F_v ,F,CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 1) = -(F_v[i]()()); @@ -355,8 +355,8 @@ private: T = Zero(); - auto T_v = T.View(CpuWrite); - auto F_v = F.View(CpuRead); + autoView( T_v , T,CpuWrite); + autoView( F_v , F,CpuRead); thread_for(i, CloverTerm.Grid()->oSites(), { T_v[i]()(0, 0) = timesI(F_v[i]()()); diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 356d0941..b4afc69a 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -106,9 +106,9 @@ public: const _SpinorField & phi, int mu) { - auto out_v= out.View(CpuWrite); - auto phi_v= phi.View(CpuRead); - auto Umu_v= Umu.View(CpuRead); + autoView( out_v, out, CpuWrite); + autoView( phi_v, phi, CpuRead); + autoView( Umu_v, Umu, CpuRead); thread_for(sss,out.Grid()->oSites(),{ multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); }); @@ -191,18 +191,19 @@ public: int Ls=Btilde.Grid()->_fdimensions[0]; GaugeLinkField tmp(mat.Grid()); tmp = Zero(); - auto tmp_v = tmp.View(CpuWrite); - auto Btilde_v = Btilde.View(CpuRead); - auto Atilde_v = Atilde.View(CpuRead); - thread_for(sss,tmp.Grid()->oSites(),{ - int sU=sss; - for(int s=0;s(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here - } - }); + { + autoView( tmp_v , tmp, CpuWrite); + autoView( Btilde_v , Btilde, CpuRead); + autoView( Atilde_v , Atilde, CpuRead); + thread_for(sss,tmp.Grid()->oSites(),{ + int sU=sss; + for(int s=0;s(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here + } + }); + } PokeIndex(mat,tmp,mu); - } }; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 72940cda..d2537ccf 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -50,9 +50,9 @@ CayleyFermion5D::M5D(const FermionField &psi_i, chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(AcceleratorRead); - auto phi = phi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; @@ -93,9 +93,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(AcceleratorRead); - auto phi = phi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; @@ -131,8 +131,8 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); int Ls=this->Ls; @@ -193,8 +193,8 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi GridBase *grid=psi_i.Grid(); int Ls=this->Ls; - auto psi = psi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); auto plee = & lee [0]; auto pdee = & dee [0]; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h index 079ea481..b54f63ad 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h @@ -65,9 +65,9 @@ CayleyFermion5D::M5D(const FermionField &psi_i, EnableIf sfinae=0; chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi = psi_i.View(CpuRead); - auto phi = phi_i.View(CpuRead); - auto chi = chi_i.View(CpuWrite); + autoView(psi, psi_i,CpuRead); + autoView(phi, phi_i,CpuRead); + autoView(chi, chi_i,CpuWrite); int Ls = this->Ls; int LLs = grid->_rdimensions[0]; const int nsimd= Simd::Nsimd(); @@ -213,9 +213,9 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, EnableIf sfinae=0; chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); - auto psi=psi_i.View(CpuRead); - auto phi=phi_i.View(CpuRead); - auto chi=chi_i.View(CpuWrite); + autoView(psi,psi_i,CpuRead); + autoView(phi,phi_i,CpuRead); + autoView(chi,chi_i,CpuWrite); int Ls = this->Ls; int LLs = grid->_rdimensions[0]; int nsimd= Simd::Nsimd(); @@ -357,8 +357,8 @@ CayleyFermion5D::MooeeInternalAsm(const FermionField &psi_i, FermionField Vector > &Matm) { EnableIf sfinae=0; - auto psi = psi_i.View(CpuRead); - auto chi = chi_i.View(CpuWrite); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); #ifndef AVX512 { SiteHalfSpinor BcastP; @@ -535,8 +535,8 @@ CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi_i, FermionField EnableIf sfinae=0; #ifndef AVX512 { - auto psi = psi_i.View(CpuRead); - auto chi = chi_i.View(CpuWrite); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); SiteHalfSpinor BcastP; SiteHalfSpinor BcastM; @@ -586,8 +586,8 @@ CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi_i, FermionField } #else { - auto psi = psi_i.View(CpuRead); - auto chi = chi_i.View(CpuWrite); + autoView(psi , psi_i,CpuRead); + autoView(chi , chi_i,CpuWrite); // pointers // MASK_REGS; #define Chi_00 %zmm0 diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 100eb0d2..9a8454ef 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -46,9 +46,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi chi_i.Checkerboard() = psi_i.Checkerboard(); int Ls = this->Ls; GridBase* grid = psi_i.Grid(); - auto phi = phi_i.View(AcceleratorRead); - auto psi = psi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView( phi , phi_i, AcceleratorRead); + autoView( psi , psi_i, AcceleratorRead); + autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; auto pupper = &upper[0]; @@ -82,9 +82,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio GridBase* grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(AcceleratorRead); - auto phi = phi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView( psi , psi_i, AcceleratorRead); + autoView( phi , phi_i, AcceleratorRead); + autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); auto pdiag = &diag[0]; auto pupper = &upper[0]; @@ -116,8 +116,8 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); - auto psi=psi_i.View(AcceleratorRead); - auto chi=chi_i.View(AcceleratorWrite); + autoView( psi, psi_i, AcceleratorRead); + autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; auto plee = & this->lee[0]; @@ -172,8 +172,8 @@ void DomainWallEOFAFermion::MooeeInvDag(const FermionField& psi_i, Fermion { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); - auto psi = psi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView( psi, psi_i, AcceleratorRead); + autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; auto plee = & this->lee[0]; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index 58d2b368..87acca0e 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D::DhopDir(const FermionField &in, FermionFi Compressor compressor; Stencil.HaloExchange(in,compressor); - auto Umu_v = Umu.View(CpuRead); - auto UUUmu_v = UUUmu.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); + autoView( Umu_v , Umu, CpuRead); + autoView( UUUmu_v , UUUmu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); thread_for( ss,Umu.Grid()->oSites(),{ for(int s=0;s::DerivInternal(StencilImpl &st, DoubledGauge //////////////////////// // Call the single hop //////////////////////// - auto U_v = U.View(CpuRead); - auto UUU_v = UUU.View(CpuRead); - auto B_v = B.View(CpuWrite); - auto Btilde_v = Btilde.View(CpuWrite); + autoView( U_v , U, CpuRead); + autoView( UUU_v , UUU, CpuRead); + autoView( B_v , B, CpuWrite); + autoView( Btilde_v , Btilde, CpuWrite); thread_for(sss,B.Grid()->oSites(),{ Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); }); @@ -378,10 +378,10 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel Compressor compressor; Stencil.HaloExchange(in, compressor); - auto Umu_v = Umu.View(CpuRead); - auto UUUmu_v = UUUmu.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); + autoView( Umu_v , Umu, CpuRead); + autoView( UUUmu_v , UUUmu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); thread_for( sss, in.Grid()->oSites(),{ Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); }); diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index ed7be056..41b9170d 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -44,9 +44,9 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(AcceleratorRead); - auto phi = phi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -84,9 +84,9 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(AcceleratorRead); - auto phi = phi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto pm = this->pm; int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator @@ -132,9 +132,9 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(AcceleratorRead); - auto phi = phi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -174,9 +174,9 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm GridBase *grid = psi_i.Grid(); int Ls = this->Ls; int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator - auto psi = psi_i.View(AcceleratorRead); - auto phi = phi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i, AcceleratorRead); + autoView(phi , phi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); @@ -226,8 +226,8 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto plee = & this->lee [0]; auto pdee = & this->dee [0]; @@ -286,8 +286,8 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto pm = this->pm; auto plee = & this->lee [0]; @@ -354,8 +354,8 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); int Ls = this->Ls; - auto psi = psi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); auto plee = & this->lee [0]; auto pdee = & this->dee [0]; @@ -410,8 +410,8 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); - auto psi = psi_i.View(AcceleratorRead); - auto chi = chi_i.View(AcceleratorWrite); + autoView(psi , psi_i, AcceleratorRead); + autoView(chi , chi_i, AcceleratorWrite); int Ls = this->Ls; auto pm = this->pm; diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h index ccd36f57..49696aa7 100644 --- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h @@ -208,9 +208,9 @@ void NaiveStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGaugeFie //////////////////////// // Call the single hop //////////////////////// - auto U_v = U.View(CpuRead); - auto B_v = B.View(CpuWrite); - auto Btilde_v = Btilde.View(CpuWrite); + autoView( U_v , U, CpuRead); + autoView( B_v , B, CpuWrite); + autoView( Btilde_v , Btilde, CpuWrite); thread_for(sss,B.Grid()->oSites(),{ Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); }); @@ -315,9 +315,9 @@ void NaiveStaggeredFermion::DhopDir(const FermionField &in, FermionField & Compressor compressor; Stencil.HaloExchange(in, compressor); - auto Umu_v = Umu.View(CpuRead); - auto in_v = in.View(CpuRead); - auto out_v = out.View(CpuWrite); + autoView( Umu_v , Umu, CpuRead); + autoView( in_v , in, CpuRead); + autoView( out_v , out, CpuWrite); // thread_for( sss, in.Grid()->oSites(),{ // Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); // }); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index d7abef27..141725a7 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -261,11 +261,11 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, GridBase *FGrid=in.Grid(); GridBase *UGrid=U.Grid(); typedef StaggeredKernels ThisKernel; - auto UUU_v = UUU.View(AcceleratorRead); - auto U_v = U.View(AcceleratorRead); - auto in_v = in.View(AcceleratorRead); - auto out_v = out.View(AcceleratorWrite); - auto st_v = st.View(AcceleratorRead); + autoView( UUU_v , UUU, AcceleratorRead); + autoView( U_v , U, AcceleratorRead); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); SiteSpinor * buf = st.CommBuf(); int Ls=1; @@ -301,11 +301,11 @@ void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, GridBase *FGrid=in.Grid(); GridBase *UGrid=U.Grid(); typedef StaggeredKernels ThisKernel; - auto UUU_v= U.View(AcceleratorRead); - auto U_v = U.View(AcceleratorRead); - auto in_v = in.View(AcceleratorRead); - auto out_v = out.View(AcceleratorWrite); - auto st_v = st.View(AcceleratorRead); + autoView( UUU_v , U, AcceleratorRead); + autoView( U_v , U, AcceleratorRead); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); SiteSpinor * buf = st.CommBuf(); int Ls=1; diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 9e492831..3db59b1d 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -475,12 +475,12 @@ void WilsonFermion::ContractConservedCurrent(PropagatorField &q_in_1, // Inefficient comms method but not performance critical. tmp1 = Cshift(q_in_1, mu, 1); tmp2 = Cshift(q_in_2, mu, 1); - auto tmp1_v = tmp1.View(CpuWrite); - auto tmp2_v = tmp2.View(CpuWrite); - auto q_in_1_v=q_in_1.View(CpuRead); - auto q_in_2_v=q_in_2.View(CpuRead); - auto q_out_v = q_out.View(CpuRead); - auto Umu_v = Umu.View(CpuRead); + autoView( tmp1_v , tmp1, CpuWrite); + autoView( tmp2_v , tmp2, CpuWrite); + autoView( q_in_1_v,q_in_1, CpuRead); + autoView( q_in_2_v,q_in_2, CpuRead); + autoView( q_out_v , q_out, CpuRead); + autoView( Umu_v , Umu, CpuRead); thread_for(sU, Umu.Grid()->oSites(),{ Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU], q_in_2_v[sU], @@ -526,11 +526,11 @@ void WilsonFermion::SeqConservedCurrent(PropagatorField &q_in, tmp = lattice_cmplx*q_in; tmpBwd = Cshift(tmp, mu, -1); - auto coords_v = coords.View(CpuRead); - auto tmpFwd_v = tmpFwd.View(CpuRead); - auto tmpBwd_v = tmpBwd.View(CpuRead); - auto Umu_v = Umu.View(CpuRead); - auto q_out_v = q_out.View(CpuWrite); + autoView( coords_v , coords, CpuRead); + autoView( tmpFwd_v , tmpFwd, CpuRead); + autoView( tmpBwd_v , tmpBwd, CpuRead); + autoView( Umu_v , Umu, CpuRead); + autoView( q_out_v , q_out, CpuWrite); thread_for(sU, Umu.Grid()->oSites(), { diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 8f8c1063..603be7ec 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -348,18 +348,18 @@ template void WilsonKernels::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, std::vector &out) { - auto U_v = U.View(AcceleratorRead); - auto in_v = in.View(AcceleratorRead); - auto st_v = st.View(AcceleratorRead); + autoView(U_v ,U,AcceleratorRead); + autoView(in_v ,in,AcceleratorRead); + autoView(st_v ,st,AcceleratorRead); - auto out_Xm = out[0].View(AcceleratorWrite); - auto out_Ym = out[1].View(AcceleratorWrite); - auto out_Zm = out[2].View(AcceleratorWrite); - auto out_Tm = out[3].View(AcceleratorWrite); - auto out_Xp = out[4].View(AcceleratorWrite); - auto out_Yp = out[5].View(AcceleratorWrite); - auto out_Zp = out[6].View(AcceleratorWrite); - auto out_Tp = out[7].View(AcceleratorWrite); + autoView(out_Xm,out[0],AcceleratorWrite); + autoView(out_Ym,out[1],AcceleratorWrite); + autoView(out_Zm,out[2],AcceleratorWrite); + autoView(out_Tm,out[3],AcceleratorWrite); + autoView(out_Xp,out[4],AcceleratorWrite); + autoView(out_Yp,out[5],AcceleratorWrite); + autoView(out_Zp,out[6],AcceleratorWrite); + autoView(out_Tp,out[7],AcceleratorWrite); auto CBp=st.CommBuf(); accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ int sU=sss/Ls; @@ -383,10 +383,10 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S assert(dirdisp<=7); assert(dirdisp>=0); - auto U_v = U.View(AcceleratorRead); - auto in_v = in.View(AcceleratorRead); - auto out_v = out.View(AcceleratorWrite); - auto st_v = st.View(AcceleratorRead); + autoView(U_v ,U ,AcceleratorRead); + autoView(in_v ,in ,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v ,st ,AcceleratorRead); auto CBp=st.CommBuf(); #define LoopBody(Dir) \ case Dir : \ @@ -438,10 +438,10 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField int Ls, int Nsite, const FermionField &in, FermionField &out, int interior,int exterior) { - auto U_v = U.View(AcceleratorRead); - auto in_v = in.View(AcceleratorRead); - auto out_v = out.View(AcceleratorWrite); - auto st_v = st.View(AcceleratorRead); + autoView(U_v , U,AcceleratorRead); + autoView(in_v , in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v , st,AcceleratorRead); if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} @@ -469,10 +469,10 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField int Ls, int Nsite, const FermionField &in, FermionField &out, int interior,int exterior) { - auto U_v = U.View(AcceleratorRead); - auto in_v = in.View(AcceleratorRead); - auto out_v = out.View(AcceleratorWrite); - auto st_v = st.View(AcceleratorRead); + autoView(U_v ,U,AcceleratorRead); + autoView(in_v ,in,AcceleratorRead); + autoView(out_v,out,AcceleratorWrite); + autoView(st_v ,st,AcceleratorRead); if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index 79549dcb..1368667e 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -86,8 +86,8 @@ public: // Move this elsewhere? FIXME static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W - auto U_v = U.View(CpuWrite); - auto W_v = W.View(CpuRead); + autoView(U_v,U,CpuWrite); + autoView(W_v,W,CpuRead); thread_for( ss, U.Grid()->oSites(), { U_v[ss](mu) = U_v[ss](mu) + W_v[ss](); }); @@ -131,15 +131,15 @@ public: //static std::chrono::duration diff; //auto start = std::chrono::high_resolution_clock::now(); - auto U_v = U.View(CpuWrite); - auto P_v = P.View(CpuRead); + autoView(U_v,U,CpuWrite); + autoView(P_v,P,CpuRead); thread_for(ss, P.Grid()->oSites(),{ for (int mu = 0; mu < Nd; mu++) { U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); } }); - //auto end = std::chrono::high_resolution_clock::now(); + //auto end = std::chrono::high_resolution_clock::now(); // diff += end - start; // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; } diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h index 7ac85d56..5a5f9251 100644 --- a/Grid/qcd/action/scalar/ScalarInteractionAction.h +++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h @@ -89,8 +89,8 @@ public: action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared; - auto p_v = p.View(CpuRead); - auto action_v = action.View(CpuWrite); + autoView( p_v , p, CpuRead); + autoView( action_v , action, CpuWrite); for (int mu = 0; mu < Ndim; mu++) { // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils @@ -146,8 +146,8 @@ public: for (int point = 0; point < npoint; point++) { - auto p_v = p.View(CpuRead); - auto force_v = force.View(CpuWrite); + autoView( p_v , p, CpuRead); + autoView( force_v , force, CpuWrite); int permute_type; StencilEntry *SE; diff --git a/Grid/qcd/modules/Registration.h b/Grid/qcd/modules/Registration.h index ec28f020..c1149b83 100644 --- a/Grid/qcd/modules/Registration.h +++ b/Grid/qcd/modules/Registration.h @@ -81,7 +81,7 @@ static Registrar, static Registrar< ConjugateGradientModule, HMC_SolverModuleFactory > __CGWFmodXMLInit("ConjugateGradient"); static Registrar< BiCGSTABModule, - HMC_SolverModuleFactory > __CGWFmodXMLInit("BiCGSTAB"); + HMC_SolverModuleFactory > __BiCGWFmodXMLInit("BiCGSTAB"); static Registrar< ConjugateResidualModule, HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index 7ad496b7..b63d8571 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -185,13 +185,14 @@ void A2Autils::MesonField(TensorType &mat, for(int i=0;i::MesonField(TensorType &mat, int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; for ( int m=0;m::PionFieldXX(Eigen::Tensor &mat, for(int i=0;i::PionFieldXX(Eigen::Tensor &mat, } for(int j=0;j::PionFieldWVmom(Eigen::Tensor &mat, for(int i=0;i::PionFieldWVmom(Eigen::Tensor &mat, int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; for ( int m=0;m::AslashField(TensorType &mat, for(int i=0;i::AslashField(TensorType &mat, for ( int m=0;m::ContractWWVV(std::vector &WWVV, for(int d_o=0;d_o::ContractWWVV(std::vector &WWVV, thread_for(ss,grid->oSites(),{ for(int d_o=0;d_o::OuterProductWWVV(PropagatorField &WWVV, const vobj &rhs, const int Ns, const int ss) { - auto WWVV_v = WWVV.View(CpuWrite); + autoView(WWVV_v,WWVV,CpuWrite); for (int s1 = 0; s1 < Ns; s1++){ for (int s2 = 0; s2 < Ns; s2++){ WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0); @@ -1122,10 +1121,10 @@ void A2Autils::ContractFourQuarkColourDiagonal(const PropagatorField &WWV GridBase *grid = WWVV0.Grid(); - auto WWVV0_v = WWVV0.View(CpuRead); - auto WWVV1_v = WWVV1.View(CpuRead); - auto O_trtr_v= O_trtr.View(CpuWrite); - auto O_fig8_v= O_fig8.View(CpuWrite); + autoView(WWVV0_v , WWVV0,CpuRead); + autoView(WWVV1_v , WWVV1,CpuRead); + autoView(O_trtr_v, O_trtr,CpuWrite); + autoView(O_fig8_v, O_fig8,CpuWrite); thread_for(ss,grid->oSites(),{ typedef typename ComplexField::vector_object vobj; @@ -1166,10 +1165,10 @@ void A2Autils::ContractFourQuarkColourMix(const PropagatorField &WWVV0, GridBase *grid = WWVV0.Grid(); - auto WWVV0_v = WWVV0.View(CpuRead); - auto WWVV1_v = WWVV1.View(CpuRead); - auto O_trtr_v= O_trtr.View(CpuWrite); - auto O_fig8_v= O_fig8.View(CpuWrite); + autoView( WWVV0_v , WWVV0,CpuRead); + autoView( WWVV1_v , WWVV1,CpuRead); + autoView( O_trtr_v, O_trtr,CpuWrite); + autoView( O_fig8_v, O_fig8,CpuWrite); thread_for(ss,grid->oSites(),{ diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index d45fd93d..32beac9c 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -273,10 +273,10 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, for (int ie=0; ie < 6 ; ie++) wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; - auto vbaryon_corr= baryon_corr.View(CpuWrite); - auto v1 = q1_left.View(CpuRead); - auto v2 = q2_left.View(CpuRead); - auto v3 = q3_left.View(CpuRead); + autoView(vbaryon_corr, baryon_corr,CpuWrite); + autoView( v1 , q1_left, CpuRead); + autoView( v2 , q2_left, CpuRead); + autoView( v3 , q3_left, CpuRead); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ @@ -560,10 +560,10 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, { GridBase *grid = qs_ti.Grid(); - auto vcorr= stn_corr.View(CpuWrite); - auto vq_loop = qq_loop.View(CpuRead); - auto vd_tf = qd_tf.View(CpuRead); - auto vs_ti = qs_ti.View(CpuRead); + autoView( vcorr, stn_corr, CpuWrite); + autoView( vq_loop , qq_loop, CpuRead); + autoView( vd_tf , qd_tf, CpuRead); + autoView( vs_ti , qs_ti, CpuRead); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ @@ -597,12 +597,11 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, { GridBase *grid = qs_ti.Grid(); - auto vcorr= stn_corr.View(CpuWrite); - auto vq_ti = qq_ti.View(CpuRead); - auto vq_tf = qq_tf.View(CpuRead); - auto vd_tf = qd_tf.View(CpuRead); - auto vs_ti = qs_ti.View(CpuRead); - + autoView( vcorr , stn_corr, CpuWrite); + autoView( vq_ti , qq_ti, CpuRead); + autoView( vq_tf , qq_tf, CpuRead); + autoView( vd_tf , qd_tf, CpuRead); + autoView( vs_ti , qs_ti, CpuRead); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ auto Dq_ti = vq_ti[ss]; diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h index 0adbfabf..1e016e4e 100644 --- a/Grid/qcd/utils/LinalgUtils.h +++ b/Grid/qcd/utils/LinalgUtils.h @@ -47,8 +47,8 @@ void axpibg5x(Lattice &z,const Lattice &x,Coeff a,Coeff b) GridBase *grid=x.Grid(); Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(AcceleratorRead); - auto z_v = z.View(AcceleratorWrite); + autoView(x_v, x, AcceleratorRead); + autoView(z_v, z, AcceleratorWrite); accelerator_for( ss, x_v.size(),vobj::Nsimd(), { auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss))); coalescedWrite(z_v[ss],tmp); @@ -63,9 +63,9 @@ void axpby_ssp(Lattice &z, Coeff a,const Lattice &x,Coeff b,const La conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(AcceleratorRead); - auto y_v = y.View(AcceleratorRead); - auto z_v = z.View(AcceleratorWrite); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); // FIXME -- need a new class of accelerator_loop to implement this // uint64_t nloop = grid->oSites()/Ls; @@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(AcceleratorRead); - auto y_v = y.View(AcceleratorRead); - auto z_v = z.View(AcceleratorWrite); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const L conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(AcceleratorRead); - auto y_v = y.View(AcceleratorRead); - auto z_v = z.View(AcceleratorWrite); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); Gamma G5(Gamma::Algebra::Gamma5); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ @@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice &z,Coeff a,const Lattice &x,Coeff b,const GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(AcceleratorRead); - auto y_v = y.View(AcceleratorRead); - auto z_v = z.View(AcceleratorWrite); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); Gamma G5(Gamma::Algebra::Gamma5); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ @@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(AcceleratorRead); - auto y_v = y.View(AcceleratorRead); - auto z_v = z.View(AcceleratorWrite); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice &z,Coeff a,const Lattice &x,Coeff b,con conformable(x,z); GridBase *grid=x.Grid(); int Ls = grid->_rdimensions[0]; - auto x_v = x.View(AcceleratorRead); - auto y_v = y.View(AcceleratorRead); - auto z_v = z.View(AcceleratorWrite); + autoView( x_v, x, AcceleratorRead); + autoView( y_v, y, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -189,8 +189,8 @@ void G5R5(Lattice &z,const Lattice &x) conformable(x,z); int Ls = grid->_rdimensions[0]; Gamma G5(Gamma::Algebra::Gamma5); - auto x_v = x.View(AcceleratorRead); - auto z_v = z.View(AcceleratorWrite); + autoView( x_v, x, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; @@ -222,8 +222,8 @@ void G5C(Lattice> &z, const LatticeoSites(),CComplex::Nsimd(), { for(int n = 0; n < nb; ++n) { diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 5f98f926..476c3d40 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -222,10 +222,10 @@ public: conformable(subgroup, Determinant); int i0, i1; su2SubGroupIndex(i0, i1, su2_index); - auto subgroup_v = subgroup.View(CpuWrite); - auto source_v = source.View(CpuRead); - auto Determinant_v = Determinant.View(CpuWrite); + autoView( subgroup_v , subgroup,CpuWrite); + autoView( source_v , source,CpuRead); + autoView( Determinant_v , Determinant,CpuWrite); thread_for(ss, grid->oSites(), { subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0); @@ -257,8 +257,8 @@ public: su2SubGroupIndex(i0, i1, su2_index); dest = 1.0; // start out with identity - auto dest_v = dest.View(CpuWrite); - auto subgroup_v = subgroup.View(CpuRead); + autoView( dest_v , dest, CpuWrite); + autoView( subgroup_v, subgroup, CpuRead); thread_for(ss, grid->oSites(), { dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0); @@ -266,6 +266,7 @@ public: dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0); dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1); }); + } /////////////////////////////////////////////// @@ -608,8 +609,8 @@ public: // reunitarise?? template - static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, - double scale = 1.0) { + static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0) + { GridBase *grid = out.Grid(); typedef typename LatticeMatrixType::vector_type vector_type; @@ -618,8 +619,7 @@ public: typedef iSinglet vTComplexType; typedef Lattice LatticeComplexType; - typedef typename GridTypeMapper< - typename LatticeMatrixType::vector_object>::scalar_object MatrixType; + typedef typename GridTypeMapper::scalar_object MatrixType; LatticeComplexType ca(grid); LatticeMatrixType lie(grid); @@ -629,6 +629,7 @@ public: MatrixType ta; lie = Zero(); + for (int a = 0; a < AdjointDimension; a++) { random(pRNG, ca); @@ -640,6 +641,7 @@ public: la = ci * ca * ta; lie = lie + la; // e^{i la ta} + } taExp(lie, out); } diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 5602420b..3b9ae08e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -67,6 +67,7 @@ void Gather_plane_simple_table (Vector >& table,const Lattice { int num=table.size(); std::pair *table_v = & table[0]; + auto rhs_v = rhs.View(AcceleratorRead); accelerator_forNB( i,num, vobj::Nsimd(), { typedef decltype(coalescedRead(buffer[0])) compressed_t; @@ -75,6 +76,7 @@ void Gather_plane_simple_table (Vector >& table,const Lattice compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); coalescedWrite(buffer[off+o],tmp_c); }); + rhs_v.ViewClose(); // Further optimisatoin: i) software prefetch the first element of the next table entry, prefetch the table } @@ -104,6 +106,7 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic so+tp[2*j+1].second, type); }); + rhs_v.ViewClose(); } struct StencilEntry { @@ -181,31 +184,30 @@ class CartesianStencilAccelerator { template class CartesianStencilView : public CartesianStencilAccelerator { -#ifndef GRID_UVM - std::shared_ptr Deleter; -#endif + private: + int *closed; + StencilEntry *cpu_ptr; + ViewMode mode; public: - // -#ifdef GRID_UVM - CartesianStencilView (const CartesianStencilAccelerator &refer_to_me,ViewMode mode) - : CartesianStencilAccelerator(refer_to_me){}; -#else - CartesianStencilView (const CartesianStencilView &refer_to_me) - : CartesianStencilAccelerator(refer_to_me), Deleter(refer_to_me.Deleter) - { } - CartesianStencilView (const CartesianStencilAccelerator &refer_to_me,ViewMode mode) - : CartesianStencilAccelerator(refer_to_me), Deleter(new MemViewDeleter) - { - Deleter->cpu_ptr =(void *)this->_entries_p; - Deleter->mode = mode; - this->_entries_p =(StencilEntry *) + // default copy constructor + CartesianStencilView (const CartesianStencilView &refer_to_me) = default; + CartesianStencilView (const CartesianStencilAccelerator &refer_to_me,ViewMode _mode) + : CartesianStencilAccelerator(refer_to_me), + cpu_ptr(this->_entries_p), + mode(_mode) + { + this->_entries_p =(StencilEntry *) MemoryManager::ViewOpen(this->_entries_p, - this->_npoints*this->_osites*sizeof(StencilEntry), - mode, - AdviseDefault); - } -#endif + this->_npoints*this->_osites*sizeof(StencilEntry), + mode, + AdviseDefault); + } + + void ViewClose(void) + { + MemoryManager::ViewClose(this->cpu_ptr,this->mode); + } }; diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 917c1c34..d049fd2f 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -36,7 +36,6 @@ void acceleratorInit(void) #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); - cudaGetDeviceProperties(&gpu_props[i], i); if ( world_rank == 0) { cudaDeviceProp prop; @@ -57,6 +56,8 @@ void acceleratorInit(void) } } MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours +#undef GPU_PROP_FMT +#undef GPU_PROP #ifdef GRID_IBM_SUMMIT // IBM Jsrun makes cuda Device numbering screwy and not match rank @@ -117,6 +118,8 @@ void acceleratorInit(void) // GPU_PROP(singleToDoublePrecisionPerfRatio); } } +#undef GPU_PROP_FMT +#undef GPU_PROP #ifdef GRID_IBM_SUMMIT // IBM Jsrun makes cuda Device numbering screwy and not match rank if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n"); @@ -162,17 +165,18 @@ void acceleratorInit(void) for(int d = 0;d().c_str()); + printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info().c_str()); #define GPU_PROP_FMT(prop,FMT) \ - printf("AcceleratorSyclInit: " #prop ": " FMT" \n",prop,devices[d].get_info()); + printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info()); -#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%d"); +#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); GPU_PROP_STR(vendor); GPU_PROP_STR(version); - GPU_PROP_STR(device_type); - GPU_PROP_STR(max_compute_units); + // GPU_PROP_STR(device_type); + /* + GPU_PROP(max_compute_units); GPU_PROP(native_vector_width_char); GPU_PROP(native_vector_width_short); GPU_PROP(native_vector_width_int); @@ -183,7 +187,8 @@ void acceleratorInit(void) GPU_PROP(address_bits); GPU_PROP(half_fp_config); GPU_PROP(single_fp_config); - GPU_PROP(double_fp_config); + */ + // GPU_PROP(double_fp_config); GPU_PROP(global_mem_size); } diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 08e25668..0d904225 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -252,9 +252,9 @@ public: double start=usecond(); for(int i=0;i U(4,FGrid); - auto Umu_v = Umu.View(CpuRead); - auto Umu5d_v = Umu5d.View(CpuWrite); - for(int ss=0;ssoSites();ss++){ - for(int s=0;soSites();ss++){ + for(int s=0;s U(4,FGrid); { - auto Umu5d_v = Umu5d.View(CpuWrite); - auto Umu_v = Umu.View(CpuRead); + autoView( Umu5d_v, Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;s & latt4, int Ls, int threads,int report ) LatticeGaugeField Umu5d(FGrid); // replicate across fifth dimension - auto Umu5d_v = Umu5d.View(CpuWrite); - auto Umu_v = Umu.View(CpuRead); - for(int ss=0;ssoSites();ss++){ - for(int s=0;soSites();ss++){ + for(int s=0;s > &mat, for(int b=0;b > &mat, for(int b=0;b > &mat int ss= so+n*stride+b; for(int i=0;i > &m for(int i=0;i > &m // Trigger unroll for ( int m=0;m using namespace std; using namespace Grid; - ; template struct scal { @@ -51,6 +50,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; + { GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -100,6 +100,8 @@ int main (int argc, char ** argv) ConjugateGradient CG(1.0e-8,10000); CG(HermOpEO,src_o,result_o_2); + MemoryManager::Print(); + LatticeFermionD diff_o(FrbGrid); RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); @@ -130,7 +132,9 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << " CG checksums "<oSites();i++){ auto SE = gStencil.GetEntry(0,i); - auto check = Check.View(CpuWrite); - auto foo = Foo.View(CpuRead); - + autoView(check, Check, CpuWrite); + autoView( foo, Foo, CpuRead); // Encapsulate in a general wrapper check[i] = foo[SE->_offset]; auto tmp=check[i]; if (SE->_permute & 0x1 ) { permute(check[i],tmp,0); tmp=check[i];} @@ -147,8 +146,8 @@ int main(int argc, char ** argv) }}}} if (nrm > 1.0e-4) { - auto check = Check.View(CpuRead); - auto bar = Bar.View(CpuRead); + autoView( check , Check, CpuRead); + autoView( bar , Bar, CpuRead); for(int i=0;i_is_local && SE->_permute ) permute(check[i],foo[SE->_offset],permute_type); else if (SE->_is_local) @@ -151,8 +151,8 @@ int main(int argc, char ** argv) { }}}} if (nrm > 1.0e-4) { - auto check = Check.View(CpuRead); - auto bar = Bar.View(CpuRead); + autoView( check , Check, CpuRead); + autoView( bar , Bar, CpuRead); for(int i=0;i " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) permute(ocheck[i],efoo[SE->_offset],permute_type); else if (SE->_is_local) @@ -226,8 +226,8 @@ int main(int argc, char ** argv) { SE = OStencil.GetEntry(permute_type,0,i); // std::cout << "ODD source "<< i<<" -> " <_offset << " "<< SE->_is_local<_is_local && SE->_permute ) permute(echeck[i],ofoo[SE->_offset],permute_type); else if (SE->_is_local) diff --git a/tests/core/Test_staggered5D.cc b/tests/core/Test_staggered5D.cc index 402e69d5..e4cd007f 100644 --- a/tests/core/Test_staggered5D.cc +++ b/tests/core/Test_staggered5D.cc @@ -89,8 +89,8 @@ int main (int argc, char ** argv) //////////////////////////////////// LatticeGaugeField Umu5d(FGrid); { - auto umu5d = Umu5d.View(CpuWrite); - auto umu = Umu.View(CpuRead); + autoView(umu5d, Umu5d, CpuWrite); + autoView( umu, Umu , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;s U(4,FGrid); { - auto Umu5d_v = Umu5d.View(CpuWrite); - auto Umu_v = Umu.View(CpuRead); + autoView( Umu5d_v , Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); for(int ss=0;ssoSites();ss++){ for(int s=0;soSites(),{ uint64_t ss= sss*Ls; typedef vSpinColourVector spinor; diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc index 4c3a3f53..cb30faad 100644 --- a/tests/forces/Test_contfrac_force.cc +++ b/tests/forces/Test_contfrac_force.cc @@ -98,9 +98,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto U_v = U.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_force.cc b/tests/forces/Test_dwf_force.cc index fea867e6..81a1b8c4 100644 --- a/tests/forces/Test_dwf_force.cc +++ b/tests/forces/Test_dwf_force.cc @@ -100,9 +100,9 @@ int main (int argc, char ** argv) // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto U_v = U.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc index 5b864279..0b0ba346 100644 --- a/tests/forces/Test_dwf_force_eofa.cc +++ b/tests/forces/Test_dwf_force_eofa.cc @@ -110,9 +110,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto U_v = U.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc index a0743edc..b39fdd14 100644 --- a/tests/forces/Test_dwf_gpforce.cc +++ b/tests/forces/Test_dwf_gpforce.cc @@ -119,9 +119,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto U_v = U.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_dwf_gpforce_eofa.cc b/tests/forces/Test_dwf_gpforce_eofa.cc index 69b9adec..58258a5e 100644 --- a/tests/forces/Test_dwf_gpforce_eofa.cc +++ b/tests/forces/Test_dwf_gpforce_eofa.cc @@ -114,9 +114,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto U_v = U.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_gp_plaq_force.cc b/tests/forces/Test_gp_plaq_force.cc index 5de7ddb7..21f0b9d0 100644 --- a/tests/forces/Test_gp_plaq_force.cc +++ b/tests/forces/Test_gp_plaq_force.cc @@ -85,9 +85,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto Uprime_v = Uprime.View(CpuWrite); - auto U_v = U.View(CpuRead); - auto mom_v = mom.View(CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc index 026ce60f..bb4ea6de 100644 --- a/tests/forces/Test_gp_rect_force.cc +++ b/tests/forces/Test_gp_rect_force.cc @@ -87,9 +87,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto Uprime_v= Uprime.View(CpuWrite); - auto U_v = U.View(CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc index 22927d01..bdc332d9 100644 --- a/tests/forces/Test_gpdwf_force.cc +++ b/tests/forces/Test_gpdwf_force.cc @@ -105,9 +105,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(CpuRead); - auto mom_v = mom.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc index 41c4641d..1c85a5d9 100644 --- a/tests/forces/Test_gpwilson_force.cc +++ b/tests/forces/Test_gpwilson_force.cc @@ -99,9 +99,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto U_v = U.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc index daab4149..11e69652 100644 --- a/tests/forces/Test_mobius_force.cc +++ b/tests/forces/Test_mobius_force.cc @@ -101,9 +101,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(CpuRead); - auto mom_v = mom.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc index 7a8d4cf8..f85501fa 100644 --- a/tests/forces/Test_mobius_force_eofa.cc +++ b/tests/forces/Test_mobius_force_eofa.cc @@ -112,9 +112,9 @@ int main (int argc, char** argv) PokeIndex(mom, mommu, mu); // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto U_v = U.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc index 13de233b..68163e63 100644 --- a/tests/forces/Test_mobius_gpforce_eofa.cc +++ b/tests/forces/Test_mobius_gpforce_eofa.cc @@ -115,9 +115,9 @@ int main (int argc, char** argv) SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); - auto U_v = U.View(CpuRead); - auto mom_v = mom.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); // fourth order exponential approx thread_foreach( i, mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt + mom_v[i](mu) *mom_v[i](mu) *U_v[i](mu)*(dt*dt/2.0) diff --git a/tests/forces/Test_partfrac_force.cc b/tests/forces/Test_partfrac_force.cc index 9292274e..17dce530 100644 --- a/tests/forces/Test_partfrac_force.cc +++ b/tests/forces/Test_partfrac_force.cc @@ -101,9 +101,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(CpuRead); - auto mom_v = mom.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc index 909068c2..ed72f2c0 100644 --- a/tests/forces/Test_rect_force.cc +++ b/tests/forces/Test_rect_force.cc @@ -87,9 +87,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto Uprime_v = Uprime.View(CpuWrite); - auto U_v = U.View(CpuRead); - auto mom_v = mom.View(CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ; }); diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc index 397dc40c..c8b3a7f4 100644 --- a/tests/forces/Test_wilson_force.cc +++ b/tests/forces/Test_wilson_force.cc @@ -105,9 +105,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto U_v = U.View(CpuRead); - auto mom_v = mom.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach( i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu); Uprime_v[i](mu) += mom_v[i](mu)*U_v[i](mu)*dt ; diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc index ff664e19..f26f0ac9 100644 --- a/tests/forces/Test_wilsonclover_force.cc +++ b/tests/forces/Test_wilsonclover_force.cc @@ -105,9 +105,9 @@ int main(int argc, char **argv) Hmom -= real(sum(trace(mommu * mommu))); PokeIndex(mom, mommu, mu); - auto Uprime_v = Uprime.View(CpuWrite); - auto U_v = U.View(CpuRead); - auto mom_v = mom.View(CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); + autoView( U_v , U, CpuRead); + autoView( mom_v, mom, CpuRead); thread_foreach(ss,mom_v, { Uprime_v[ss]._internal[mu] = ProjectOnGroup(Exponentiate(mom_v[ss]._internal[mu], dt, 12) * U_v[ss]._internal[mu]); diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc index 2ed12acd..e24ae601 100644 --- a/tests/forces/Test_zmobius_force.cc +++ b/tests/forces/Test_zmobius_force.cc @@ -114,9 +114,9 @@ int main (int argc, char ** argv) PokeIndex(mom,mommu,mu); // fourth order exponential approx - auto mom_v = mom.View(CpuRead); - auto U_v = U.View(CpuRead); - auto Uprime_v = Uprime.View(CpuWrite); + autoView( mom_v, mom, CpuRead); + autoView( U_v , U, CpuRead); + autoView(Uprime_v, Uprime, CpuWrite); thread_foreach(i,mom_v,{ Uprime_v[i](mu) = U_v[i](mu) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index f93af852..8e083231 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -300,8 +300,8 @@ int main (int argc, char ** argv) int nb=nbasisc/2; CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,12.0,0.02,500,100,100,0.0); for(int n=0;noSites();site++){ subspace_g5[site](nn) = subspace[site](nn);