From 04863f8f3835bbfec9f913139eddc1dca717bbb2 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 4 May 2020 16:07:03 -0400 Subject: [PATCH] debug new AcceleratorView --- Grid/lattice/Lattice_ET.h | 3 +- Grid/lattice/Lattice_arith.h | 69 ++++++++++++++++--------------- Grid/lattice/Lattice_base.h | 71 +++++++++++++++++++++++++++----- Grid/lattice/Lattice_reduction.h | 14 +++---- 4 files changed, 105 insertions(+), 52 deletions(-) diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index cf7147b9..da63d5e6 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -9,6 +9,7 @@ Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: neo +Author: Christoph Lehner &arg) template accelerator_inline const lobj & eval(const uint64_t ss, const Lattice &arg) { - auto view = arg.View(); + auto view = arg.AcceleratorView(ViewRead); return view[ss]; } diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index 3543d6aa..c4a67620 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -7,6 +7,7 @@ Copyright (C) 2015 Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid); template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); conformable(ret,rhs); conformable(lhs,rhs); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ @@ -55,9 +56,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto 
rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -72,9 +73,9 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -88,9 +89,9 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -107,8 +108,8 @@ template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; mult(&tmp,&lhs_v(ss),&rhs); @@ -120,8 +121,8 @@ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); 
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -134,8 +135,8 @@ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -147,8 +148,8 @@ template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -164,8 +165,8 @@ template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -178,8 +179,8 @@ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -192,8 +193,8 @@ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = 
ret.AcceleratorView(ViewWrite); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -205,8 +206,8 @@ template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -220,9 +221,9 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(); - auto x_v = x.View(); - auto y_v = y.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto x_v = x.AcceleratorView(ViewRead); + auto y_v = y.AcceleratorView(ViewRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+y_v(ss); coalescedWrite(ret_v[ss],tmp); @@ -233,9 +234,9 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(); - auto x_v = x.View(); - auto y_v = y.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto x_v = x.AcceleratorView(ViewRead); + auto y_v = y.AcceleratorView(ViewRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+b*y_v(ss); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 157c647b..30aa6b06 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -54,8 +54,20 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) // Advise the LatticeAccelerator class //////////////////////////////////////////////////////////////////////////// enum LatticeAcceleratorAdvise { - AdviseInfrequentUse = 0x1 // Advise that the
data is used infrequently. This can - // significantly influence performance of bulk storage. + AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can + // significantly influence performance of bulk storage. + AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures + // enables read-only copies of memory to be kept on + // host and device. +}; + +//////////////////////////////////////////////////////////////////////////// +// View Access Mode +//////////////////////////////////////////////////////////////////////////// +enum ViewMode { + ViewRead = 0x1, + ViewWrite = 0x2, + ViewReadWrite = 0x3 }; //////////////////////////////////////////////////////////////////////////// @@ -91,6 +103,29 @@ public: if (advise & AdviseInfrequentUse) { cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId); } + if (advise & AdviseReadMostly) { + cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1); + } +#endif +#endif + }; + + accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future +#ifdef GRID_NVCC +#ifndef __CUDA_ARCH__ // only on host + int target; + cudaGetDevice(&target); + cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target); + std::cout<< GridLogMessage << "To Device " << target << std::endl; +#endif +#endif + }; + + accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future +#ifdef GRID_NVCC +#ifndef __CUDA_ARCH__ // only on host + cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); + std::cout<< GridLogMessage << "To Host" << std::endl; #endif #endif }; @@ -225,9 +260,23 @@ public: // The view is trivially copy constructible and may be copied to an accelerator device // in device lambdas ///////////////////////////////////////////////////////////////////////////////// - LatticeView View (void) const + LatticeView View (void) const // 
deprecated, should pick AcceleratorView for accelerator_for + { // and HostView for thread_for + LatticeView accessor(*( (LatticeAccelerator *) this)); + return accessor; + } + + LatticeView AcceleratorView(int mode = ViewReadWrite) const { LatticeView accessor(*( (LatticeAccelerator *) this)); + accessor.AcceleratorPrefetch(mode); + return accessor; + } + + LatticeView HostView(int mode = ViewReadWrite) const + { + LatticeView accessor(*( (LatticeAccelerator *) this)); + accessor.HostPrefetch(mode); return accessor; } @@ -251,7 +300,7 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -270,7 +319,7 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -288,7 +337,7 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -399,8 +448,9 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = View(); - auto him= r.View(); + std::cout << GridLogMessage << "Copy other" << std::endl; + auto me = AcceleratorView(ViewWrite); + auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); @@ -413,8 +463,9 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = View(); - auto him= r.View(); + std::cout << GridLogMessage << "Copy same" << std::endl; + auto me = AcceleratorView(ViewWrite); + auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ 
coalescedWrite(me[ss],him(ss)); }); diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 8acbde66..3147823d 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -103,8 +103,8 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & GridBase *grid = left.Grid(); // Might make all code paths go this way. - auto left_v = left.View(); - auto right_v=right.View(); + auto left_v = left.AcceleratorView(ViewRead); + auto right_v=right.AcceleratorView(ViewRead); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); @@ -175,9 +175,9 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt GridBase *grid = x.Grid(); - auto x_v=x.View(); - auto y_v=y.View(); - auto z_v=z.View(); + auto x_v=x.AcceleratorView(ViewRead); + auto y_v=y.AcceleratorView(ViewRead); + auto z_v=z.AcceleratorView(ViewWrite); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); @@ -224,8 +224,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti GridBase *grid = left.Grid(); - auto left_v=left.View(); - auto right_v=right.View(); + auto left_v=left.AcceleratorView(ViewRead); + auto right_v=right.AcceleratorView(ViewRead); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites();