diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index a44b245c..3d0a2a75 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -37,211 +37,6 @@ Author: Christoph Lehner NAMESPACE_BEGIN(Grid); - //////////////////////////////////////////////////////// - // Move following 100 LOC to lattice/Lattice_basis.h - //////////////////////////////////////////////////////// -template -void basisOrthogonalize(std::vector &basis,Field &w,int k) -{ - // If assume basis[j] are already orthonormal, - // can take all inner products in parallel saving 2x bandwidth - // Save 3x bandwidth on the second line of loop. - // perhaps 2.5x speed up. - // 2x overall in Multigrid Lanczos - for(int j=0; j -void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) -{ - typedef decltype(basis[0].View()) View; - auto tmp_v = basis[0].View(); - Vector basis_v(basis.size(),tmp_v); - typedef typename Field::vector_object vobj; - GridBase* grid = basis[0].Grid(); - - for(int k=0;k > Bt(thread_max() * Nm); // Thread private - thread_region - { - vobj* B = Bt.data() + Nm * thread_num(); - - thread_for_in_region(ss, grid->oSites(),{ - for(int j=j0; joSites(); - uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead - - // printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock); - - Vector Bt(siteBlock * nrot); - auto Bp=&Bt[0]; - - // GPU readable copy of Eigen matrix - Vector Qt_jv(Nm*Nm); - double *Qt_p = & Qt_jv[0]; - for(int k=0;k -void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) -{ - typedef decltype(basis[0].View()) View; - typedef typename Field::vector_object vobj; - GridBase* grid = basis[0].Grid(); - - result.Checkerboard() = basis[0].Checkerboard(); - auto result_v=result.View(); - Vector basis_v(basis.size(),result_v); - for(int k=0;k Qt_jv(Nm); - double * Qt_j = & Qt_jv[0]; - for(int k=0;koSites(),vobj::Nsimd(),{ - auto B=coalescedRead(basis_v[k0][ss]); - B=Zero(); - for(int k=k0; k -void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) -{ - int vlen = idx.size(); - - assert(vlen>=1); - assert(vlen<=sort_vals.size()); - assert(vlen<=_v.size()); - - for (size_t i=0;ii for which _vnew[j] = _vold[i], - // track the move idx[j] => idx[i] - // track the move idx[i] => i - ////////////////////////////////////// - size_t j; - for (j=i;j i); assert(j!=idx.size()); assert(idx[j]==i); - - swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy - std::swap(sort_vals[i],sort_vals[idx[i]]); - - idx[j] = idx[i]; - idx[i] = i; - } - } -} - -inline std::vector basisSortGetIndex(std::vector& sort_vals) -{ - std::vector idx(sort_vals.size()); - std::iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { - return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); - }); - return idx; -} - -template -void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) -{ - std::vector idx = basisSortGetIndex(sort_vals); - if (reverse) - std::reverse(idx.begin(), idx.end()); - - basisReorderInPlace(_v,sort_vals,idx); -} - -// PAB: faster to compute the inner products first then fuse loops. -// If performance critical can improve. 
-template -void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { - result = Zero(); - assert(_v.size()==eval.size()); - int N = (int)_v.size(); - for (int i=0;i #include #include #include -#include +//#include #include #include #include -#include +//#include #include #include #include @@ -43,4 +43,4 @@ Author: Peter Boyle #include #include #include - +#include diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index cf7147b9..da63d5e6 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -9,6 +9,7 @@ Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: neo +Author: Christoph Lehner &arg) template accelerator_inline const lobj & eval(const uint64_t ss, const Lattice &arg) { - auto view = arg.View(); + auto view = arg.AcceleratorView(ViewRead); return view[ss]; } diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index 3543d6aa..c4a67620 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -7,6 +7,7 @@ Copyright (C) 2015 Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid); template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); conformable(ret,rhs); conformable(lhs,rhs); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ @@ -55,9 +56,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -72,9 +73,9 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -88,9 +89,9 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -107,8 +108,8 @@ template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); 
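// Editorial note (illustrative sketch, not part of the patch): the View() ->
// AcceleratorView(mode) change threads a read/write hint down to the prefetch
// helpers added in Lattice_base.h. A hypothetical caller-side kernel written
// against the new API would follow the same pattern as below; the field names
// and grid pointer are placeholders, not Grid identifiers.
#if 0
  GridCartesian *grid = /* ... some 4d grid ... */ nullptr;
  Lattice<vComplexD> z(grid), x(grid), y(grid);
  auto z_v = z.AcceleratorView(ViewWrite);   // destination: write hint
  auto x_v = x.AcceleratorView(ViewRead);    // sources: read hints trigger
  auto y_v = y.AcceleratorView(ViewRead);    // a device prefetch before use
  accelerator_for(ss, x_v.size(), vComplexD::Nsimd(), {
    coalescedWrite(z_v[ss], x_v(ss) + y_v(ss));
  });
#endif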
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; mult(&tmp,&lhs_v(ss),&rhs); @@ -120,8 +121,8 @@ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -134,8 +135,8 @@ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -147,8 +148,8 @@ template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -164,8 +165,8 @@ template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -178,8 +179,8 @@ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -192,8 +193,8 @@ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -205,8 +206,8 @@ template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -220,9 +221,9 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(); - auto x_v = x.View(); - auto y_v = y.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto x_v = x.AcceleratorView(ViewRead); + auto y_v = y.AcceleratorView(ViewRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+y_v(ss); coalescedWrite(ret_v[ss],tmp); @@ -233,9 +234,9 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const 
Lattice ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(); - auto x_v = x.View(); - auto y_v = y.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto x_v = x.AcceleratorView(ViewRead); + auto y_v = y.AcceleratorView(ViewRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+b*y_v(ss); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index ec7c54ec..74525cc1 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -9,6 +9,7 @@ Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: paboyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -49,6 +50,26 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) assert(lhs == rhs); } +//////////////////////////////////////////////////////////////////////////// +// Advise the LatticeAccelerator class +//////////////////////////////////////////////////////////////////////////// +enum LatticeAcceleratorAdvise { + AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can + // significantly influence performance of bulk storage. + AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures + // enables read-only copies of memory to be kept on + // host and device. +}; + +//////////////////////////////////////////////////////////////////////////// +// View Access Mode +//////////////////////////////////////////////////////////////////////////// +enum ViewMode { + ViewRead = 0x1, + ViewWrite = 0x2, + ViewReadWrite = 0x3 +}; + //////////////////////////////////////////////////////////////////////////// // Minimal base class containing only data valid to access from accelerator // _odata will be a managed pointer in CUDA @@ -75,6 +96,37 @@ public: if (grid) conformable(grid, _grid); else grid = _grid; }; + + accelerator_inline void Advise(int advise) { +#ifdef GRID_NVCC +#ifndef __CUDA_ARCH__ // only on host + if (advise & AdviseInfrequentUse) { + cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId); + } + if (advise & AdviseReadMostly) { + cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1); + } +#endif +#endif + }; + + accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future +#ifdef GRID_NVCC +#ifndef __CUDA_ARCH__ // only on host + int target; + cudaGetDevice(&target); + cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target); +#endif +#endif + }; + + accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future +#ifdef GRID_NVCC +#ifndef __CUDA_ARCH__ // only on host + cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); +#endif +#endif + }; }; ///////////////////////////////////////////////////////////////////////////////////////// @@ -206,9 +258,23 @@ public: // The view is trivially copy constructible and may be copied to an accelerator device // in device lambdas ///////////////////////////////////////////////////////////////////////////////// - LatticeView View (void) const + LatticeView View (void) const // deprecated, should pick AcceleratorView for accelerator_for + { // and HostView for thread_for + LatticeView accessor(*( (LatticeAccelerator *) this)); + return accessor; + } + + LatticeView AcceleratorView(int mode = 
ViewReadWrite) const { LatticeView accessor(*( (LatticeAccelerator *) this)); + accessor.AcceleratorPrefetch(mode); + return accessor; + } + + LatticeView HostView(int mode = ViewReadWrite) const + { + LatticeView accessor(*( (LatticeAccelerator *) this)); + accessor.HostPrefetch(mode); return accessor; } @@ -232,7 +298,7 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -251,7 +317,7 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -269,7 +335,7 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -357,7 +423,6 @@ public: // copy constructor /////////////////////////////////////////// Lattice(const Lattice& r){ - // std::cout << "Lattice constructor(const Lattice &) "<_grid = r.Grid(); resize(this->_grid->oSites()); *this = r; @@ -380,8 +445,8 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = View(); - auto him= r.View(); + auto me = AcceleratorView(ViewWrite); + auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); @@ -394,8 +459,8 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = View(); - auto him= r.View(); + auto me = AcceleratorView(ViewWrite); + auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h new file mode 100644 index 00000000..f1126936 --- /dev/null +++ b/Grid/lattice/Lattice_basis.h @@ -0,0 +1,236 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/lattice/Lattice_basis.h + +Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle +Author: Christoph Lehner + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ + +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +void basisOrthogonalize(std::vector &basis,Field &w,int k) +{ + // If assume basis[j] are already orthonormal, + // can take all inner products in parallel saving 2x bandwidth + // Save 3x bandwidth on the second line of loop. + // perhaps 2.5x speed up. + // 2x overall in Multigrid Lanczos + for(int j=0; j +void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) +{ + typedef decltype(basis[0]) Field; + typedef decltype(basis[0].View()) View; + auto tmp_v = basis[0].AcceleratorView(ViewReadWrite); + Vector basis_v(basis.size(),tmp_v); + typedef typename std::remove_reference::type vobj; + GridBase* grid = basis[0].Grid(); + + for(int k=0;k B(Nm); // Thread private + thread_for_in_region(ss, grid->oSites(),{ + for(int j=j0; joSites(); + uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead + + Vector Bt(siteBlock * nrot); + auto Bp=&Bt[0]; + + // GPU readable copy of matrix + Vector Qt_jv(Nm*Nm); + double *Qt_p = & Qt_jv[0]; + thread_for(i,Nm*Nm,{ + int j = i/Nm; + int k = i%Nm; + Qt_p[i]=Qt(j,k); + }); + + // Block the loop to keep storage footprint down + for(uint64_t s=0;s +void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) +{ + typedef decltype(basis[0].AcceleratorView()) View; + typedef typename Field::vector_object vobj; + GridBase* grid = basis[0].Grid(); + + result.Checkerboard() = basis[0].Checkerboard(); + auto result_v=result.AcceleratorView(ViewWrite); + Vector basis_v(basis.size(),result_v); + for(int k=0;k Qt_jv(Nm); + double * Qt_j = & Qt_jv[0]; + for(int k=0;koSites(),vobj::Nsimd(),{ + auto B=coalescedRead(zz); + for(int k=k0; k +void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) +{ + int vlen = idx.size(); + + assert(vlen>=1); + assert(vlen<=sort_vals.size()); + assert(vlen<=_v.size()); + + for (size_t i=0;ii for which _vnew[j] = _vold[i], + // track the move idx[j] => idx[i] + // track the move idx[i] => i + ////////////////////////////////////// + size_t j; + for (j=i;j i); assert(j!=idx.size()); assert(idx[j]==i); + + swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy + std::swap(sort_vals[i],sort_vals[idx[i]]); + + idx[j] = idx[i]; + idx[i] = i; + } + } +} + +inline std::vector basisSortGetIndex(std::vector& sort_vals) +{ + std::vector idx(sort_vals.size()); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { + return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); + }); + return idx; +} + +template +void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) +{ + std::vector idx = basisSortGetIndex(sort_vals); + if (reverse) + std::reverse(idx.begin(), idx.end()); + + basisReorderInPlace(_v,sort_vals,idx); +} + +// PAB: faster to compute the inner products first then fuse loops. +// If performance critical can improve. 
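// Editorial note (illustrative sketch, not part of the patch): typical chaining
// of the relocated basis helpers after an eigensolver run. The names evec,
// eval, src and guess are placeholders for the caller's data.
#if 0
  basisSortInPlace(evec, eval, false);   // ascending in |eval|; true reverses
  basisDeflate(evec, eval, src, guess);  // guess = sum_i v_i <v_i|src> / eval_i
#endif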
+template +void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { + result = Zero(); + assert(_v.size()==eval.size()); + int N = (int)_v.size(); + for (int i=0;i Author: Peter Boyle Author: paboyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -93,7 +94,7 @@ template inline RealD norm2(const Lattice &arg){ // Double inner product template -inline ComplexD innerProduct(const Lattice &left,const Lattice &right) +inline ComplexD rankInnerProduct(const Lattice &left,const Lattice &right) { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; @@ -102,8 +103,8 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ GridBase *grid = left.Grid(); // Might make all code paths go this way. - auto left_v = left.View(); - auto right_v=right.View(); + auto left_v = left.AcceleratorView(ViewRead); + auto right_v=right.AcceleratorView(ViewRead); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); @@ -137,11 +138,18 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ }) nrm = TensorRemove(sum(inner_tmp_v,sites)); #endif - grid->GlobalSum(nrm); - return nrm; } +template +inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { + GridBase *grid = left.Grid(); + ComplexD nrm = rankInnerProduct(left,right); + grid->GlobalSum(nrm); + return nrm; +} + + ///////////////////////// // Fast axpby_norm // z = a x + b y @@ -167,9 +175,9 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt GridBase *grid = x.Grid(); - auto x_v=x.View(); - auto y_v=y.View(); - auto z_v=z.View(); + auto x_v=x.AcceleratorView(ViewRead); + auto y_v=y.AcceleratorView(ViewRead); + auto z_v=z.AcceleratorView(ViewWrite); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); @@ -204,8 +212,64 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt grid->GlobalSum(nrm); return nrm; } - +template strong_inline void +innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Lattice &right) +{ + conformable(left,right); + + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + Vector tmp(2); + + GridBase *grid = left.Grid(); + + auto left_v=left.AcceleratorView(ViewRead); + auto right_v=right.AcceleratorView(ViewRead); + + const uint64_t nsimd = grid->Nsimd(); + const uint64_t sites = grid->oSites(); + +#ifdef GRID_NVCC + // GPU + typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; + typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t; + Vector inner_tmp(sites); + Vector norm_tmp(sites); + auto inner_tmp_v = &inner_tmp[0]; + auto norm_tmp_v = &norm_tmp[0]; + + accelerator_for( ss, sites, nsimd,{ + auto left_tmp = left_v(ss); + coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); + coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)); + }); + + tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites)); + tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites)); +#else + // CPU + typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t; + typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t; + Vector inner_tmp(sites); + Vector norm_tmp(sites); + auto inner_tmp_v = &inner_tmp[0]; + auto norm_tmp_v = &norm_tmp[0]; + + accelerator_for( ss, sites, 
nsimd,{ + auto left_tmp = left_v(ss); + inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss)); + norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp); + }); + // Already promoted to double + tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); + tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); +#endif + grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector + ip = tmp[0]; + nrm = real(tmp[1]); +} + template inline auto sum(const LatticeUnaryExpression & expr) ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object diff --git a/Grid/lattice/Lattice_trace.h b/Grid/lattice/Lattice_trace.h index 93444e0c..6b5f67d2 100644 --- a/Grid/lattice/Lattice_trace.h +++ b/Grid/lattice/Lattice_trace.h @@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid); //////////////////////////////////////////////////////////////////////////////////////////////////// // Trace //////////////////////////////////////////////////////////////////////////////////////////////////// +/* template inline auto trace(const Lattice &lhs) -> Lattice { @@ -48,6 +49,7 @@ inline auto trace(const Lattice &lhs) -> Lattice }); return ret; }; +*/ //////////////////////////////////////////////////////////////////////////////////////////////////// // Trace Index level dependent operation diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index c80e7db2..c23ddcdc 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -6,6 +6,7 @@ Copyright (C) 2015 Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -63,6 +64,7 @@ template inline void pickCheckerboard(int cb,Lattice &half,con } }); } + template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.Checkerboard(); auto half_v = half.View(); @@ -81,25 +83,130 @@ template inline void setCheckerboard(Lattice &full,const Latti } }); } - -template + +//////////////////////////////////////////////////////////////////////////////////////////// +// Flexible Type Conversion for internal promotion to double as well as graceful +// treatment of scalar-compatible types +//////////////////////////////////////////////////////////////////////////////////////////// +accelerator_inline void convertType(ComplexD & out, const std::complex & in) { + out = in; +} + +accelerator_inline void convertType(ComplexF & out, const std::complex & in) { + out = in; +} + +#ifdef __CUDA_ARCH__ +accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { + ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in; +} +accelerator_inline void convertType(vComplexD & out, const ComplexD & in) { + ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in; +} +accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) { + ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in; +} +#endif + +accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) { + out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v); +} + +accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { + Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); +} + +template + accelerator_inline void convertType(iMatrix & out, const iMatrix & in); +template + accelerator_inline void convertType(iVector & out, const iVector & in); + +template::value, T1>::type* = nullptr> +accelerator_inline void convertType(T1 & out, const 
iScalar & in) { + convertType(out,in._internal); +} + +template +accelerator_inline void convertType(iScalar & out, const T2 & in) { + convertType(out._internal,in); +} + +template +accelerator_inline void convertType(iMatrix & out, const iMatrix & in) { + for (int i=0;i +accelerator_inline void convertType(iVector & out, const iVector & in) { + for (int i=0;i::value, T>::type* = nullptr> +accelerator_inline void convertType(T & out, const T & in) { + out = in; +} + +template +accelerator_inline void convertType(Lattice & out, const Lattice & in) { + auto out_v = out.AcceleratorView(ViewWrite); + auto in_v = in.AcceleratorView(ViewRead); + + accelerator_for(ss,out_v.size(),T1::Nsimd(),{ + convertType(out_v[ss],in_v(ss)); + }); +} + +//////////////////////////////////////////////////////////////////////////////////////////// +// precision-promoted local inner product +//////////////////////////////////////////////////////////////////////////////////////////// +template +inline auto localInnerProductD(const Lattice &lhs,const Lattice &rhs) +-> Lattice> +{ + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); + + typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner; + Lattice> ret(lhs.Grid()); + auto ret_v = ret.AcceleratorView(ViewWrite); + + accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ + convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss))); + }); + + return ret; +} + +//////////////////////////////////////////////////////////////////////////////////////////// +// block routines +//////////////////////////////////////////////////////////////////////////////////////////// +template inline void blockProject(Lattice > &coarseData, - const Lattice &fineData, - const std::vector > &Basis) + const Lattice &fineData, + const VLattice &Basis) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); - Lattice ip(coarse); + Lattice> ip(coarse); + Lattice fineDataRed = fineData; // auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); - auto ip_ = ip.View(); + auto coarseData_ = coarseData.AcceleratorView(ViewWrite); + auto ip_ = ip.AcceleratorView(ViewReadWrite); for(int v=0;v accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { - coalescedWrite(coarseData_[sc](v),ip_(sc)); + convertType(coarseData_[sc](v),ip_[sc]); }); + + // improve numerical stability of projection + // |fine> = |fine> - |basis> + ip=-ip; + blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); } } @@ -166,11 +273,11 @@ inline void blockProject1(Lattice > &coarseData, return; } -template -inline void blockZAXPY(Lattice &fineZ, - const Lattice &coarseA, - const Lattice &fineX, - const Lattice &fineY) +template + inline void blockZAXPY(Lattice &fineZ, + const Lattice &coarseA, + const Lattice &fineX, + const Lattice &fineY) { GridBase * fine = fineZ.Grid(); GridBase * coarse= coarseA.Grid(); @@ -182,7 +289,7 @@ inline void blockZAXPY(Lattice &fineZ, conformable(fineX,fineZ); int _ndimension = coarse->_ndimension; - + Coordinate block_r (_ndimension); // FIXME merge with subdivide checking routine as this is redundant @@ -191,29 +298,65 @@ inline void blockZAXPY(Lattice &fineZ, assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); } - auto fineZ_ = fineZ.View(); - auto fineX_ = fineX.View(); - auto fineY_ = fineY.View(); - auto coarseA_= coarseA.View(); + auto fineZ_ = fineZ.AcceleratorView(ViewWrite); + auto fineX_ = fineX.AcceleratorView(ViewRead); + auto fineY_ = fineY.AcceleratorView(ViewRead); + auto coarseA_= 
coarseA.AcceleratorView(ViewRead); accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { - - int sc; - Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + int sc; + Coordinate coor_c(_ndimension); + Coordinate coor_f(_ndimension); - // z = A x + y - coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - }); + // z = A x + y +#ifdef __CUDA_ARCH__ + typename vobj2::tensor_reduced::scalar_object cA; + typename vobj::scalar_object cAx; +#else + typename vobj2::tensor_reduced cA; + vobj cAx; +#endif + convertType(cA,TensorRemove(coarseA_(sc))); + auto prod = cA*fineX_(sf); + convertType(cAx,prod); + coalescedWrite(fineZ_[sf],cAx+fineY_(sf)); + + }); return; } + template + inline void blockInnerProductD(Lattice &CoarseInner, + const Lattice &fineX, + const Lattice &fineY) +{ + typedef iScalar dotp; + + GridBase *coarse(CoarseInner.Grid()); + GridBase *fine (fineX.Grid()); + + Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); + Lattice coarse_inner(coarse); + + auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); + auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); + + // Precision promotion + fine_inner = localInnerProductD(fineX,fineY); + blockSum(coarse_inner,fine_inner); + accelerator_for(ss, coarse->oSites(), 1, { + convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); + }); + +} + +template // deprecate inline void blockInnerProduct(Lattice &CoarseInner, const Lattice &fineX, const Lattice &fineY) @@ -227,8 +370,8 @@ inline void blockInnerProduct(Lattice &CoarseInner, Lattice coarse_inner(coarse); // Precision promotion? - auto CoarseInner_ = CoarseInner.View(); - auto coarse_inner_ = coarse_inner.View(); + auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); + auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); @@ -236,6 +379,7 @@ inline void blockInnerProduct(Lattice &CoarseInner, CoarseInner_[ss] = coarse_inner_[ss]; }); } + template inline void blockNormalise(Lattice &ip,Lattice &fineX) { @@ -248,7 +392,7 @@ inline void blockNormalise(Lattice &ip,Lattice &fineX) // useful in multigrid project; // Generic name : Coarsen? 
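// Editorial note (illustrative sketch, not part of the patch): the intended
// multigrid-setup call sequence for the reworked block routines below.
// FineField, CoarseScalar, CoarseGrid, FineGrid, nbasis and fineVec are
// placeholders for the caller's types and data.
#if 0
  std::vector<FineField> subspace(nbasis, FineField(FineGrid));
  CoarseScalar ip(CoarseGrid);
  blockOrthonormalize(ip, subspace);            // new name for blockOrthogonalise
  Lattice<iVector<vComplexD, nbasis>> cvec(CoarseGrid);
  blockProject(cvec, fineVec, subspace);        // projection now also subtracts
                                                // |basis><basis|fine> from a working
                                                // copy for numerical stability
#endif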
template -inline void blockSum(Lattice &coarseData,const Lattice &fineData) +inline void blockSum(Lattice &coarseData,const Lattice &fineData) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); @@ -256,42 +400,41 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) subdivides(coarse,fine); // require they map int _ndimension = coarse->_ndimension; - + Coordinate block_r (_ndimension); - + for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } int blockVol = fine->oSites()/coarse->oSites(); - // Turn this around to loop threaded over sc and interior loop - // over sf would thread better - auto coarseData_ = coarseData.View(); - auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite); + auto fineData_ = fineData.AcceleratorView(ViewRead); accelerator_for(sc,coarse->oSites(),1,{ - // One thread per sub block - Coordinate coor_c(_ndimension); - Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - coarseData_[sc]=Zero(); + // One thread per sub block + Coordinate coor_c(_ndimension); + Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); - for(int sb=0;sb_rdimensions); + for(int sb=0;sb_rdimensions); - }); + coarseData_[sc]=coarseData_[sc]+fineData_[sf]; + } + + }); return; } + template inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice &picked,Coordinate coor) { @@ -313,8 +456,8 @@ inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice -inline void blockOrthogonalise(Lattice &ip,std::vector > &Basis) +template +inline void blockOrthonormalize(Lattice &ip,VLattice &Basis) { GridBase *coarse = ip.Grid(); GridBase *fine = Basis[0].Grid(); @@ -322,23 +465,30 @@ inline void blockOrthogonalise(Lattice &ip,std::vector > int nbasis = Basis.size() ; // checks - subdivides(coarse,fine); + subdivides(coarse,fine); for(int i=0;i (Basis[v],ip,Basis[u],Basis[v]); + blockZAXPY(Basis[v],ip,Basis[u],Basis[v]); } blockNormalise(ip,Basis[v]); } } +template +inline void blockOrthogonalise(Lattice &ip,std::vector > &Basis) // deprecated inaccurate naming +{ + blockOrthonormalize(ip,Basis); +} + #if 0 +// TODO: CPU optimized version here template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -383,24 +533,18 @@ inline void blockPromote(const Lattice > &coarseData, } #else -template +template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, - const std::vector > &Basis) + const VLattice &Basis) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); - fineData=Zero(); for(int i=0;i > ip = PeekIndex<0>(coarseData,i); - Lattice cip(coarse); - auto cip_ = cip.View(); - auto ip_ = ip.View(); - accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ - coalescedWrite(cip_[sc], ip_(sc)()); - }); - blockZAXPY(fineData,cip,Basis[i],fineData); + auto ip_ = ip.AcceleratorView(ViewRead); + blockZAXPY(fineData,ip,Basis[i],fineData); } } #endif @@ -470,8 +614,8 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate rdt = Tg->_rdimensions; Coordinate ist = Tg->_istride; Coordinate ost = Tg->_ostride; - auto t_v = To.View(); - auto f_v = From.View(); + auto t_v = To.AcceleratorView(ViewWrite); + auto f_v = From.AcceleratorView(ViewRead); accelerator_for(idx,Fg->lSites(),1,{ sobj s; Coordinate Fcoor(nd); diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h index 
f90c34a9..1f11add9 100644 --- a/Grid/parallelIO/BinaryIO.h +++ b/Grid/parallelIO/BinaryIO.h @@ -341,7 +341,7 @@ class BinaryIO { int ieee32big = (format == std::string("IEEE32BIG")); int ieee32 = (format == std::string("IEEE32")); int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); + int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE")); assert(ieee64||ieee32|ieee64big||ieee32big); assert((ieee64+ieee32+ieee64big+ieee32big)==1); ////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h index 2e211838..4c1cfbdb 100644 --- a/Grid/parallelIO/MetaData.h +++ b/Grid/parallelIO/MetaData.h @@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger { }; }; +template +struct GaugeDoubleStoredMunger{ + void operator()(fobj &in, sobj &out) { + for (int mu = 0; mu < Nds; mu++) { + for (int i = 0; i < Nc; i++) { + for (int j = 0; j < Nc; j++) { + out(mu)()(i, j) = in(mu)()(i, j); + }} + } + }; +}; + +template +struct GaugeDoubleStoredUnmunger { + void operator()(sobj &in, fobj &out) { + for (int mu = 0; mu < Nds; mu++) { + for (int i = 0; i < Nc; i++) { + for (int j = 0; j < Nc; j++) { + out(mu)()(i, j) = in(mu)()(i, j); + }} + } + }; +}; + template struct Gauge3x2munger{ void operator() (fobj &in,sobj &out){ diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h index d3b62d1f..5522ba91 100644 --- a/Grid/parallelIO/NerscIO.h +++ b/Grid/parallelIO/NerscIO.h @@ -146,7 +146,7 @@ public: int ieee32big = (format == std::string("IEEE32BIG")); int ieee32 = (format == std::string("IEEE32")); int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); + int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE")); uint32_t nersc_csum,scidac_csuma,scidac_csumb; // depending on datatype, set up munger; diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h new file mode 100644 index 00000000..00911595 --- /dev/null +++ b/Grid/parallelIO/OpenQcdIO.h @@ -0,0 +1,224 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/parallelIO/OpenQcdIO.h + +Copyright (C) 2015 - 2020 + +Author: Daniel Richtmann + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +struct OpenQcdHeader : Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader, + int, Nt, + int, Nx, + int, Ny, + int, Nz, + double, plaq); +}; + +class OpenQcdIO : public BinaryIO { +public: + static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6 + + static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) { + OpenQcdHeader header; + + { + std::ifstream fin(file, std::ios::in | std::ios::binary); + fin.read(reinterpret_cast(&header), sizeof(OpenQcdHeader)); + assert(!fin.fail()); + field.data_start = fin.tellg(); + fin.close(); + } + + header.plaq /= normalisationFactor; + + // sanity check (should trigger on endian issues) + assert(0 < header.Nt && header.Nt <= 1024); + assert(0 < header.Nx && header.Nx <= 1024); + assert(0 < header.Ny && header.Ny <= 1024); + assert(0 < header.Nz && header.Nz <= 1024); + + field.dimension[0] = header.Nx; + field.dimension[1] = header.Ny; + field.dimension[2] = header.Nz; + field.dimension[3] = header.Nt; + + std::cout << GridLogDebug << "header: " << header << std::endl; + std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl; + std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl; + + assert(grid->_ndimension == Nd); + for(int d = 0; d < Nd; d++) + assert(grid->_fdimensions[d] == field.dimension[d]); + + field.plaquette = header.plaq; + + return field.data_start; + } + + template + static inline void readConfiguration(Lattice>& Umu, + FieldMetaData& header, + std::string file) { + typedef Lattice> DoubleStoredGaugeField; + + assert(Ns == 4 and Nd == 4 and Nc == 3); + + auto grid = dynamic_cast(Umu.Grid()); + assert(grid != nullptr); assert(grid->_ndimension == Nd); + + uint64_t offset = readHeader(file, Umu.Grid(), header); + + FieldMetaData clone(header); + + std::string format("IEEE64"); // they always store little endian double precsision + uint32_t nersc_csum, scidac_csuma, scidac_csumb; + + GridCartesian* grid_openqcd = createOpenQcdGrid(grid); + GridRedBlackCartesian* grid_rb = SpaceTimeGrid::makeFourDimRedBlackGrid(grid); + + typedef DoubleStoredColourMatrixD fobj; + typedef typename DoubleStoredGaugeField::vector_object::scalar_object sobj; + typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word; + + word w = 0; + + std::vector iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here + std::vector scalardata(grid->lSites()); + + IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC, + nersc_csum, scidac_csuma, scidac_csumb); + + GridStopWatch timer; + timer.Start(); + + DoubleStoredGaugeField Umu_ds(grid); + + auto munge = GaugeDoubleStoredMunger(); + + Coordinate ldim = grid->LocalDimensions(); + thread_for(idx_g, grid->lSites(), { + Coordinate coor; + grid->LocalIndexToLocalCoor(idx_g, coor); + + bool isOdd = grid_rb->CheckerBoard(coor) == Odd; + + if(!isOdd) continue; + + int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir] + + coor[Xdir] * ldim[Ydir] * ldim[Zdir] + + coor[Ydir] * ldim[Zdir] + + coor[Zdir])/2; + + munge(iodata[idx_o], scalardata[idx_g]); + }); + + grid->Barrier(); timer.Stop(); + std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << 
timer.Elapsed() << std::endl; + + timer.Reset(); timer.Start(); + + vectorizeFromLexOrdArray(scalardata, Umu_ds); + + grid->Barrier(); timer.Stop(); + std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl; + + timer.Reset(); timer.Start(); + + undoDoubleStore(Umu, Umu_ds); + + grid->Barrier(); timer.Stop(); + std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl; + + GaugeStatistics(Umu, clone); + + RealD plaq_diff = fabs(clone.plaquette - header.plaquette); + + // clang-format off + std::cout << GridLogMessage << "OpenQcd Configuration " << file + << " plaquette " << clone.plaquette + << " header " << header.plaquette + << " difference " << plaq_diff + << std::endl; + // clang-format on + + RealD precTol = (getPrecision::value == 1) ? 2e-7 : 2e-15; + RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code + + if(plaq_diff >= tol) + std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl; + assert(plaq_diff < tol); + + std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl; + } + + template + static inline void writeConfiguration(Lattice>& Umu, + std::string file) { + std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl; + exit(EXIT_FAILURE); + } + +private: + static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) { + // exploit GridCartesian to be able to still use IOobject + Coordinate gdim = grid->GlobalDimensions(); + Coordinate ldim = grid->LocalDimensions(); + Coordinate pcoor = grid->ThisProcessorCoor(); + + // openqcd does rb on the z direction + gdim[Zdir] /= 2; + ldim[Zdir] /= 2; + + // and has the order T X Y Z (from slowest to fastest) + std::swap(gdim[Xdir], gdim[Zdir]); + std::swap(ldim[Xdir], ldim[Zdir]); + std::swap(pcoor[Xdir], pcoor[Zdir]); + + GridCartesian* ret = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid()); + ret->_ldimensions = ldim; + ret->_processor_coor = pcoor; + return ret; + } + + template + static inline void undoDoubleStore(Lattice>& Umu, + Lattice> const& Umu_ds) { + conformable(Umu.Grid(), Umu_ds.Grid()); + Lattice> U(Umu.Grid()); + + // they store T+, T-, X+, X-, Y+, Y-, Z+, Z- + for(int mu_g = 0; mu_g < Nd; ++mu_g) { + int mu_o = (mu_g + 1) % Nd; + U = PeekIndex(Umu_ds, 2 * mu_o) + + Cshift(PeekIndex(Umu_ds, 2 * mu_o + 1), mu_g, +1); + PokeIndex(Umu, U, mu_g); + } + } +}; + +NAMESPACE_END(Grid); diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h new file mode 100644 index 00000000..bab54fe8 --- /dev/null +++ b/Grid/parallelIO/OpenQcdIOChromaReference.h @@ -0,0 +1,281 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h + +Copyright (C) 2015 - 2020 + +Author: Daniel Richtmann + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;} +#define CHECK_VAR(a) { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; } +// #undef CHECK +// #define CHECK + +NAMESPACE_BEGIN(Grid); + +class ParRdr { +private: + bool const swap; + + MPI_Status status; + MPI_File fp; + + int err; + + MPI_Datatype oddSiteType; + MPI_Datatype fileViewType; + + GridBase* grid; + +public: + ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr) + : swap(false) + , grid(gridPtr) { + err = MPI_File_open(comm, const_cast(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp); + assert(err == MPI_SUCCESS); + } + + virtual ~ParRdr() { MPI_File_close(&fp); } + + inline void errInfo(int const err, std::string const& func) { + static char estring[MPI_MAX_ERROR_STRING]; + int eclass = -1, len = 0; + MPI_Error_class(err, &eclass); + MPI_Error_string(err, estring, &len); + std::cerr << func << " - Error " << eclass << ": " << estring << std::endl; + } + + int readHeader(FieldMetaData& field) { + assert((grid->_ndimension == Nd) && (Nd == 4)); + assert(Nc == 3); + + OpenQcdHeader header; + + readBlock(reinterpret_cast(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR); + + header.plaq /= 3.; // TODO change this into normalizationfactor + + // sanity check (should trigger on endian issues) TODO remove? 
+ assert(0 < header.Nt && header.Nt <= 1024); + assert(0 < header.Nx && header.Nx <= 1024); + assert(0 < header.Ny && header.Ny <= 1024); + assert(0 < header.Nz && header.Nz <= 1024); + + field.dimension[0] = header.Nx; + field.dimension[1] = header.Ny; + field.dimension[2] = header.Nz; + field.dimension[3] = header.Nt; + + for(int d = 0; d < Nd; d++) + assert(grid->FullDimensions()[d] == field.dimension[d]); + + field.plaquette = header.plaq; + + field.data_start = sizeof(OpenQcdHeader); + + return field.data_start; + } + + void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) { + err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status); + errInfo(err, "MPI_File_read_at_all"); + // CHECK_VAR(err) + + int read = -1; + MPI_Get_count(&status, datatype, &read); + // CHECK_VAR(read) + assert(nbytes == (uint64_t)read); + assert(err == MPI_SUCCESS); + } + + void createTypes() { + constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd + + err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS); + err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS); + + Coordinate const L = grid->GlobalDimensions(); + Coordinate const l = grid->LocalDimensions(); + Coordinate const i = grid->ThisProcessorCoor(); + + Coordinate sizes({L[2] / 2, L[1], L[0], L[3]}); + Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]}); + Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]}); + + err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS); + err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS); + } + + void freeTypes() { + err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS); + err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS); + } + + bool readGauge(std::vector& domain_buff, FieldMetaData& meta) { + auto hdr_offset = readHeader(meta); + CHECK + createTypes(); + err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS); + CHECK + int const domainSites = grid->lSites(); + domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd + + // the actual READ + constexpr uint64_t cm_size = 2 * Nc * Nc * sizeof(double); // 2_complex + constexpr uint64_t os_size = Nd * 2 * cm_size; // 2_fwdbwd + constexpr uint64_t max_elems = std::numeric_limits::max(); // int adressable elems: floor is fine + uint64_t const n_os = domainSites / 2; + + for(uint64_t os_idx = 0; os_idx < n_os;) { + uint64_t const read_os = os_idx + max_elems <= n_os ? 
max_elems : n_os - os_idx; + uint64_t const cm = os_idx * Nd * 2; + readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType); + os_idx += read_os; + } + + CHECK + err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + errInfo(err, "MPI_File_set_view1"); + assert(err == MPI_SUCCESS); + freeTypes(); + + std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl; + return true; + } +}; + +class OpenQcdIOChromaReference : public BinaryIO { +public: + template + static inline void readConfiguration(Lattice>& Umu, + Grid::FieldMetaData& header, + std::string file) { + typedef Lattice> DoubledGaugeField; + + assert(Ns == 4 and Nd == 4 and Nc == 3); + + auto grid = Umu.Grid(); + + typedef ColourMatrixD fobj; + + std::vector iodata( + Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file + + { + ParRdr rdr(MPI_COMM_WORLD, file, grid); + rdr.readGauge(iodata, header); + } // equivalent to using binaryio + + std::vector> Umu_ds_scalar(grid->lSites()); + + copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging + + DoubledGaugeField Umu_ds(grid); + + vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds); + + redistribute(Umu, Umu_ds); // equivalent to undoDoublestore + + FieldMetaData clone(header); + + GaugeStatistics(Umu, clone); + + RealD plaq_diff = fabs(clone.plaquette - header.plaquette); + + // clang-format off + std::cout << GridLogMessage << "OpenQcd Configuration " << file + << " plaquette " << clone.plaquette + << " header " << header.plaquette + << " difference " << plaq_diff + << std::endl; + // clang-format on + + RealD precTol = (getPrecision::value == 1) ? 2e-7 : 2e-15; + RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code + + if(plaq_diff >= tol) + std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl; + assert(plaq_diff < tol); + + std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl; + } + +private: + template + static inline void redistribute(Lattice>& Umu, + Lattice> const& Umu_ds) { + Grid::conformable(Umu.Grid(), Umu_ds.Grid()); + Lattice> U(Umu.Grid()); + + U = PeekIndex(Umu_ds, 2) + Cshift(PeekIndex(Umu_ds, 3), 0, +1); PokeIndex(Umu, U, 0); + U = PeekIndex(Umu_ds, 4) + Cshift(PeekIndex(Umu_ds, 5), 1, +1); PokeIndex(Umu, U, 1); + U = PeekIndex(Umu_ds, 6) + Cshift(PeekIndex(Umu_ds, 7), 2, +1); PokeIndex(Umu, U, 2); + U = PeekIndex(Umu_ds, 0) + Cshift(PeekIndex(Umu_ds, 1), 3, +1); PokeIndex(Umu, U, 3); + } + + static inline void copyToLatticeObject(std::vector& u_fb, + std::vector const& node_buff, + GridBase* grid) { + assert(node_buff.size() == Nd * grid->lSites()); + + Coordinate const& l = grid->LocalDimensions(); + + Coordinate coord(Nd); + int& x = coord[0]; + int& y = coord[1]; + int& z = coord[2]; + int& t = coord[3]; + + int buff_idx = 0; + for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering + for(x = 0; x < l[0]; ++x) + for(y = 0; y < l[1]; ++y) + for(z = 0; z < l[2]; ++z) { + if((t + z + y + x) % 2 == 0) continue; + + int local_idx; + Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions()); + for(int mu = 0; mu < 2 * Nd; ++mu) + for(int c1 = 0; c1 < Nc; ++c1) { + for(int c2 = 0; c2 < Nc; ++c2) { + u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2); + } + } + buff_idx += 2 * Nd; + } + + assert(node_buff.size() == buff_idx); + } +}; + +NAMESPACE_END(Grid); diff --git 
a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index 5744d3bb..9d99d9e7 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -132,14 +132,14 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm); - pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm)); - pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm)); + pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm))); + pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm))); pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); - pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); - pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); + pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv))); + pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv))); } template diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 1fff4f5a..0ff72789 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -444,19 +444,19 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} #endif } assert(0 && " Kernel optimisation case not covered "); diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h index a14aec1b..19bc5aa6 100644 --- a/Grid/qcd/action/gauge/GaugeImplementations.h +++ b/Grid/qcd/action/gauge/GaugeImplementations.h @@ -59,7 +59,7 @@ public: } static inline GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { - return Cshift(adj(Link), mu, -1); + return Cshift(closure(adj(Link)), mu, -1); } static inline GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { diff --git a/Grid/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h index e4d2ce83..cb510953 100644 --- 
a/Grid/qcd/hmc/HMC_aggregate.h +++ b/Grid/qcd/hmc/HMC_aggregate.h @@ -39,6 +39,10 @@ directory #include #include #include +#include +#if !defined(GRID_COMMS_NONE) +#include +#endif NAMESPACE_CHECK(Ildg); #include diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 0367c9fa..fdd53698 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -485,7 +485,7 @@ public: // Up staple ___ ___ // | | - tmp = Cshift(adj(U[nu]), nu, -1); + tmp = Cshift(closure(adj(U[nu])), nu, -1); tmp = adj(U2[mu]) * tmp; tmp = Cshift(tmp, mu, -2); @@ -519,7 +519,7 @@ public: // // | | - tmp = Cshift(adj(U2[nu]), nu, -2); + tmp = Cshift(closure(adj(U2[nu])), nu, -2); tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp); tmp = U2[nu] * Cshift(tmp, nu, 2); Stap += Cshift(tmp, mu, 1); diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index 75e42721..dbcbae8d 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -6,6 +6,7 @@ Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: Michael Marshall +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -55,6 +56,7 @@ class GridTensorBase {}; using Complexified = typename Traits::Complexified; \ using Realified = typename Traits::Realified; \ using DoublePrecision = typename Traits::DoublePrecision; \ + using DoublePrecision2= typename Traits::DoublePrecision2; \ static constexpr int TensorLevel = Traits::TensorLevel template diff --git a/Grid/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h index 03f72966..fd651cae 100644 --- a/Grid/tensors/Tensor_inner.h +++ b/Grid/tensors/Tensor_inner.h @@ -8,6 +8,7 @@ Author: Azusa Yamaguchi Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -194,6 +195,79 @@ auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar accelerator_inline + auto innerProductD2 (const iVector& lhs,const iVector& rhs) -> iScalar +{ + typedef decltype(innerProductD2(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + zeroit(ret); + for(int c1=0;c1 accelerator_inline + auto innerProductD2 (const iMatrix& lhs,const iMatrix& rhs) -> iScalar +{ + typedef decltype(innerProductD2(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + ret=Zero(); + for(int c1=0;c1 accelerator_inline + auto innerProductD2 (const iScalar& lhs,const iScalar& rhs) -> iScalar +{ + typedef decltype(innerProductD2(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProductD2(lhs._internal,rhs._internal); + return ret; +} + ////////////////////// // Keep same precison ////////////////////// diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 9067d43d..04d7343e 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -6,6 +6,7 @@ Author: Azusa Yamaguchi Author: Peter Boyle Author: Christopher Kelly Author: Michael Marshall +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -37,6 +38,60 @@ NAMESPACE_BEGIN(Grid); template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; template struct isGridTensor> : public std::true_type { 
static constexpr bool notvalue = false; }; + // Traits to identify scalars + template struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; }; + template struct isGridScalar> : public std::true_type { static constexpr bool notvalue = false; }; + + // Store double-precision data in single-precision grids for precision promoted localInnerProductD + template + class TypePair { + public: + T _internal[2]; + TypePair& operator=(const Grid::Zero& o) { + _internal[0] = Zero(); + _internal[1] = Zero(); + return *this; + } + + TypePair operator+(const TypePair& o) const { + TypePair r; + r._internal[0] = _internal[0] + o._internal[0]; + r._internal[1] = _internal[1] + o._internal[1]; + return r; + } + + TypePair& operator+=(const TypePair& o) { + _internal[0] += o._internal[0]; + _internal[1] += o._internal[1]; + return *this; + } + + friend accelerator_inline void add(TypePair* ret, const TypePair* a, const TypePair* b) { + add(&ret->_internal[0],&a->_internal[0],&b->_internal[0]); + add(&ret->_internal[1],&a->_internal[1],&b->_internal[1]); + } + }; + typedef TypePair ComplexD2; + typedef TypePair RealD2; + typedef TypePair vComplexD2; + typedef TypePair vRealD2; + + // Traits to identify fundamental data types + template struct isGridFundamental : public std::false_type { static constexpr bool notvalue = true; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + + ////////////////////////////////////////////////////////////////////////////////// // Want to recurse: GridTypeMapper >::scalar_type == ComplexD. 
// Use of a helper class like this allows us to template specialise and "dress" @@ -81,6 +136,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexF Complexified; typedef RealF Realified; typedef RealD DoublePrecision; + typedef RealD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealD scalar_type; @@ -93,6 +149,20 @@ NAMESPACE_BEGIN(Grid); typedef ComplexD Complexified; typedef RealD Realified; typedef RealD DoublePrecision; + typedef RealD DoublePrecision2; + }; + template<> struct GridTypeMapper : public GridTypeMapper_Base { + typedef RealD2 scalar_type; + typedef RealD2 scalar_typeD; + typedef RealD2 vector_type; + typedef RealD2 vector_typeD; + typedef RealD2 tensor_reduced; + typedef RealD2 scalar_object; + typedef RealD2 scalar_objectD; + typedef ComplexD2 Complexified; + typedef RealD2 Realified; + typedef RealD2 DoublePrecision; + typedef RealD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexF scalar_type; @@ -105,6 +175,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexF Complexified; typedef RealF Realified; typedef ComplexD DoublePrecision; + typedef ComplexD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD scalar_type; @@ -117,6 +188,20 @@ NAMESPACE_BEGIN(Grid); typedef ComplexD Complexified; typedef RealD Realified; typedef ComplexD DoublePrecision; + typedef ComplexD DoublePrecision2; + }; + template<> struct GridTypeMapper : public GridTypeMapper_Base { + typedef ComplexD2 scalar_type; + typedef ComplexD2 scalar_typeD; + typedef ComplexD2 vector_type; + typedef ComplexD2 vector_typeD; + typedef ComplexD2 tensor_reduced; + typedef ComplexD2 scalar_object; + typedef ComplexD2 scalar_objectD; + typedef ComplexD2 Complexified; + typedef RealD2 Realified; + typedef ComplexD2 DoublePrecision; + typedef ComplexD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; @@ -129,6 +214,7 @@ NAMESPACE_BEGIN(Grid); typedef void Complexified; typedef void Realified; typedef void DoublePrecision; + typedef void DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { @@ -142,6 +228,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexF Complexified; typedef vRealF Realified; typedef vRealD DoublePrecision; + typedef vRealD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealD scalar_type; @@ -154,6 +241,20 @@ NAMESPACE_BEGIN(Grid); typedef vComplexD Complexified; typedef vRealD Realified; typedef vRealD DoublePrecision; + typedef vRealD DoublePrecision2; + }; + template<> struct GridTypeMapper : public GridTypeMapper_Base { + typedef RealD2 scalar_type; + typedef RealD2 scalar_typeD; + typedef vRealD2 vector_type; + typedef vRealD2 vector_typeD; + typedef vRealD2 tensor_reduced; + typedef RealD2 scalar_object; + typedef RealD2 scalar_objectD; + typedef vComplexD2 Complexified; + typedef vRealD2 Realified; + typedef vRealD2 DoublePrecision; + typedef vRealD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types @@ -167,6 +268,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexH Complexified; typedef vRealH Realified; typedef vRealD DoublePrecision; + typedef vRealD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types @@ 
-180,6 +282,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexH Complexified; typedef vRealH Realified; typedef vComplexD DoublePrecision; + typedef vComplexD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexF scalar_type; @@ -192,6 +295,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexF Complexified; typedef vRealF Realified; typedef vComplexD DoublePrecision; + typedef vComplexD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD scalar_type; @@ -204,6 +308,20 @@ NAMESPACE_BEGIN(Grid); typedef vComplexD Complexified; typedef vRealD Realified; typedef vComplexD DoublePrecision; + typedef vComplexD DoublePrecision2; + }; + template<> struct GridTypeMapper : public GridTypeMapper_Base { + typedef ComplexD2 scalar_type; + typedef ComplexD2 scalar_typeD; + typedef vComplexD2 vector_type; + typedef vComplexD2 vector_typeD; + typedef vComplexD2 tensor_reduced; + typedef ComplexD2 scalar_object; + typedef ComplexD2 scalar_objectD; + typedef vComplexD2 Complexified; + typedef vRealD2 Realified; + typedef vComplexD2 DoublePrecision; + typedef vComplexD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; @@ -216,6 +334,7 @@ NAMESPACE_BEGIN(Grid); typedef void Complexified; typedef void Realified; typedef void DoublePrecision; + typedef void DoublePrecision2; }; #define GridTypeMapper_RepeatedTypes \ @@ -234,6 +353,7 @@ NAMESPACE_BEGIN(Grid); using Complexified = iScalar; using Realified = iScalar; using DoublePrecision = iScalar; + using DoublePrecision2= iScalar; static constexpr int Rank = BaseTraits::Rank + 1; static constexpr std::size_t count = BaseTraits::count; static constexpr int Dimension(int dim) { @@ -248,6 +368,7 @@ NAMESPACE_BEGIN(Grid); using Complexified = iVector; using Realified = iVector; using DoublePrecision = iVector; + using DoublePrecision2= iVector; static constexpr int Rank = BaseTraits::Rank + 1; static constexpr std::size_t count = BaseTraits::count * N; static constexpr int Dimension(int dim) { @@ -262,6 +383,7 @@ NAMESPACE_BEGIN(Grid); using Complexified = iMatrix; using Realified = iMatrix; using DoublePrecision = iMatrix; + using DoublePrecision2= iMatrix; static constexpr int Rank = BaseTraits::Rank + 2; static constexpr std::size_t count = BaseTraits::count * N * N; static constexpr int Dimension(int dim) { diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc new file mode 100644 index 00000000..83b498c2 --- /dev/null +++ b/tests/IO/Test_openqcd_io.cc @@ -0,0 +1,84 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/io/Test_openqcd_io.cc + +Copyright (C) 2015 - 2020 + +Author: Daniel Richtmann + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include + +#if defined(GRID_COMMS_NONE) +#error This test requires Grid compiled with MPI +#endif + +using namespace Grid; + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + auto latt_size = GridDefaultLatt(); + + GridCartesian grid(latt_size, simd_layout, mpi_layout); + + GridParallelRNG pRNG(&grid); + + pRNG.SeedFixedIntegers(std::vector({45, 12, 81, 9})); + + LatticeGaugeField Umu_ref(&grid); + LatticeGaugeField Umu_me(&grid); + LatticeGaugeField Umu_diff(&grid); + + FieldMetaData header_ref; + FieldMetaData header_me; + + Umu_ref = Zero(); + Umu_me = Zero(); + + std::string file("/home/daniel/configs/openqcd/test_16x8_pbcn6"); + + if(GridCmdOptionExists(argv, argv + argc, "--config")) { + file = GridCmdOptionPayload(argv, argv + argc, "--config"); + std::cout << "file: " << file << std::endl; + assert(!file.empty()); + } + + OpenQcdIOChromaReference::readConfiguration(Umu_ref, header_ref, file); + OpenQcdIO::readConfiguration(Umu_me, header_me, file); + + std::cout << GridLogMessage << header_ref << std::endl; + std::cout << GridLogMessage << header_me << std::endl; + + Umu_diff = Umu_ref - Umu_me; + + // clang-format off + std::cout << GridLogMessage + << "norm2(Umu_ref) = " << norm2(Umu_ref) + << " norm2(Umu_me) = " << norm2(Umu_me) + << " norm2(Umu_diff) = " << norm2(Umu_diff) << std::endl; + // clang-format on + + Grid_finalize(); +} diff --git a/tests/Test_innerproduct_norm.cc b/tests/Test_innerproduct_norm.cc new file mode 100644 index 00000000..a8718c6b --- /dev/null +++ b/tests/Test_innerproduct_norm.cc @@ -0,0 +1,126 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_innerproduct_norm.cc + +Copyright (C) 2015 + +Author: Daniel Richtmann + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +using namespace Grid; + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + const int nIter = 100; + + // clang-format off + GridCartesian *Grid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi()); + GridCartesian *Grid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + // clang-format on + + GridParallelRNG pRNG_d(Grid_d); + GridParallelRNG pRNG_f(Grid_f); + + std::vector seeds_d({1, 2, 3, 4}); + std::vector seeds_f({5, 6, 7, 8}); + + pRNG_d.SeedFixedIntegers(seeds_d); + pRNG_f.SeedFixedIntegers(seeds_f); + + // clang-format off + LatticeFermionD x_d(Grid_d); random(pRNG_d, x_d); + LatticeFermionD y_d(Grid_d); random(pRNG_d, y_d); + LatticeFermionF x_f(Grid_f); random(pRNG_f, x_f); + LatticeFermionF y_f(Grid_f); random(pRNG_f, y_f); + // clang-format on + + GridStopWatch sw_ref; + GridStopWatch sw_res; + + { // double precision + ComplexD ip_d_ref, ip_d_res, diff_ip_d; + RealD norm2_d_ref, norm2_d_res, diff_norm2_d; + + sw_ref.Reset(); + sw_ref.Start(); + for(int i = 0; i < nIter; ++i) { + ip_d_ref = innerProduct(x_d, y_d); + norm2_d_ref = norm2(x_d); + } + sw_ref.Stop(); + + sw_res.Reset(); + sw_res.Start(); + for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_d_res, norm2_d_res, x_d, y_d); } + sw_res.Stop(); + + diff_ip_d = ip_d_ref - ip_d_res; + diff_norm2_d = norm2_d_ref - norm2_d_res; + + // clang-format off + std::cout << GridLogMessage << "Double: ip_ref = " << ip_d_ref << " ip_res = " << ip_d_res << " diff = " << diff_ip_d << std::endl; + std::cout << GridLogMessage << "Double: norm2_ref = " << norm2_d_ref << " norm2_res = " << norm2_d_res << " diff = " << diff_norm2_d << std::endl; + std::cout << GridLogMessage << "Double: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl; + // clang-format on + + assert(diff_ip_d == 0.); + assert(diff_norm2_d == 0.); + + std::cout << GridLogMessage << "Double: all checks passed" << std::endl; + } + + { // single precision + ComplexD ip_f_ref, ip_f_res, diff_ip_f; + RealD norm2_f_ref, norm2_f_res, diff_norm2_f; + + sw_ref.Reset(); + sw_ref.Start(); + for(int i = 0; i < nIter; ++i) { + ip_f_ref = innerProduct(x_f, y_f); + norm2_f_ref = norm2(x_f); + } + sw_ref.Stop(); + + sw_res.Reset(); + sw_res.Start(); + for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_f_res, norm2_f_res, x_f, y_f); } + sw_res.Stop(); + + diff_ip_f = ip_f_ref - ip_f_res; + diff_norm2_f = norm2_f_ref - norm2_f_res; + + // clang-format off + std::cout << GridLogMessage << "Single: ip_ref = " << ip_f_ref << " ip_res = " << ip_f_res << " diff = " << diff_ip_f << std::endl; + std::cout << GridLogMessage << "Single: norm2_ref = " << norm2_f_ref << " norm2_res = " << norm2_f_res << " diff = " << diff_norm2_f << std::endl; + std::cout << GridLogMessage << "Single: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl; + // clang-format on + + assert(diff_ip_f == 0.); + assert(diff_norm2_f == 0.); + + std::cout << GridLogMessage << "Single: all checks passed" << std::endl; + } + + Grid_finalize(); +}
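The copyToLatticeObject helper in the OpenQcdIOChromaReference hunk above walks the local volume with t as the slowest coordinate, then x, y, z, skips even-parity sites, and copies 2*Nd colour matrices per stored site from the flat node buffer. The standalone sketch below (plain C++, hypothetical local dimensions, not part of Grid) only reproduces that enumeration to make the buffer layout explicit.

// Sketch, not Grid code: enumerate local sites in the openQCD ordering used by
// copyToLatticeObject (t slowest, then x, y, z), keeping only odd-parity sites,
// each of which carries 2*Nd colour matrices in the flat link buffer.
#include <array>
#include <cstdio>

int main() {
  const int Nd = 4;
  std::array<int, 4> l = {4, 4, 4, 8};            // hypothetical local dims x,y,z,t
  int buff_idx = 0;                                // running index into the flat buffer
  for (int t = 0; t < l[3]; ++t)                   // openQCD file ordering
    for (int x = 0; x < l[0]; ++x)
      for (int y = 0; y < l[1]; ++y)
        for (int z = 0; z < l[2]; ++z) {
          if ((t + x + y + z) % 2 == 0) continue;  // only odd sites are stored
          // the 2*Nd links of this site occupy [buff_idx, buff_idx + 2*Nd)
          buff_idx += 2 * Nd;
        }
  std::printf("stored link matrices on this node: %d\n", buff_idx); // = (volume/2) * 2*Nd
  return 0;
}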
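The TypePair<T> type added to Tensor_traits.h carries two components of type T with elementwise assignment from Zero and elementwise addition; ComplexD2, RealD2 and their vector counterparts are typedefs of it and back the new DoublePrecision2 / innerProductD2 path. The fragment below is a stripped-down standalone analogue (not the Grid class) showing only the elementwise-pair semantics.

// Sketch, not Grid code: a two-component value whose components accumulate
// independently, mirroring the behaviour of TypePair<T> in Tensor_traits.h.
#include <complex>
#include <iostream>

template <typename T>
struct Pair {
  T v[2];
  Pair& operator+=(const Pair& o) {
    v[0] += o.v[0];
    v[1] += o.v[1];
    return *this;
  }
  Pair operator+(const Pair& o) const {
    Pair r(*this);
    r += o;
    return r;
  }
};

int main() {
  using C = std::complex<double>;
  Pair<C> acc{{C(0, 0), C(0, 0)}};
  acc += Pair<C>{{C(1, 2), C(3, 4)}};  // both components accumulate independently
  acc += Pair<C>{{C(5, 6), C(7, 8)}};
  std::cout << acc.v[0] << " " << acc.v[1] << "\n";  // (6,8) (10,12)
  return 0;
}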
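Test_innerproduct_norm.cc asserts that the fused innerProductNorm(ip, nrm, x, y) reproduces, bit for bit, the separately computed innerProduct(x, y) and norm2(x). The scalar sketch below spells out the two quantities being compared, the inner product sum over conj(x_i)*y_i and the norm sum over |x_i|^2, in one pass over plain std::vector data; it illustrates the reference arithmetic only and is not Grid's implementation.

// Sketch, not Grid code: the two reductions the fused call is checked against,
// accumulated in a single pass over the data.
#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

static void innerProductNormRef(std::complex<double>& ip, double& nrm,
                                const std::vector<std::complex<double>>& x,
                                const std::vector<std::complex<double>>& y) {
  ip = {0.0, 0.0};
  nrm = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {  // one pass over both vectors
    ip  += std::conj(x[i]) * y[i];              // inner product term
    nrm += std::norm(x[i]);                     // |x_i|^2
  }
}

int main() {
  std::vector<std::complex<double>> x = {{1, 1}, {2, -1}};
  std::vector<std::complex<double>> y = {{0, 1}, {1, 0}};
  std::complex<double> ip;
  double nrm;
  innerProductNormRef(ip, nrm, x, y);
  std::cout << "ip = " << ip << "  nrm = " << nrm << "\n";  // ip = (3,2)  nrm = 7
  return 0;
}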