diff --git a/.gitignore b/.gitignore index 5338acb9..40156f9d 100644 --- a/.gitignore +++ b/.gitignore @@ -88,6 +88,7 @@ Thumbs.db # build directory # ################### build*/* +Documentation/_build # IDE related files # ##################### diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index b9594678..2fd187ff 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -442,6 +442,8 @@ public: for(int p=0; poSites()*nbasis, Nsimd, { @@ -453,7 +455,7 @@ public: StencilEntry *SE; for(int p=0;p AcceleratorViewContainer; for(int p=0;p_is_local) { @@ -754,7 +758,7 @@ public: StencilEntry *SE; for(int p=0;p_is_local) { diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index ad42f049..29f0ec4b 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -136,7 +136,7 @@ public: flops=0; usec =0; Coordinate layout(Nd,1); - sgrid = new GridCartesian(dimensions,layout,processors); + sgrid = new GridCartesian(dimensions,layout,processors,*grid); }; ~FFT ( void) { @@ -182,7 +182,7 @@ public: pencil_gd[dim] = G*processors[dim]; // Pencil global vol LxLxGxLxL per node - GridCartesian pencil_g(pencil_gd,layout,processors); + GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid); // Construct pencils typedef typename vobj::scalar_object sobj; diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h index d0b133a3..15ef95c7 100644 --- a/Grid/algorithms/iterative/SchurRedBlack.h +++ b/Grid/algorithms/iterative/SchurRedBlack.h @@ -132,6 +132,31 @@ namespace Grid { (*this)(_Matrix,in,out,guess); } + void RedBlackSource(Matrix &_Matrix, const std::vector &in, std::vector &src_o) + { + GridBase *grid = _Matrix.RedBlackGrid(); + Field tmp(grid); + int nblock = in.size(); + for(int b=0;b &in, const std::vector &sol_o, std::vector &out) + { + GridBase *grid = _Matrix.RedBlackGrid(); + Field tmp(grid); + int nblock = in.size(); + for(int b=0;b void operator()(Matrix &_Matrix, const std::vector &in, std::vector &out,Guesser &guess) { @@ -150,9 +175,11 @@ namespace Grid { //////////////////////////////////////////////// // Prepare RedBlack source //////////////////////////////////////////////// - for(int b=0;b &ret,sobj a,const Lattice &x,const Lattice & autoView( x_v , x, AcceleratorRead); autoView( y_v , y, AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ - auto tmp = a*x_v(ss)+y_v(ss); + auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]); coalescedWrite(ret_v[ss],tmp); }); } diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 863b2548..0928cbd7 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -125,7 +125,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) for(int k=k0; k inline void sliceSum(const Lattice &Data,std::vector< // But easily avoided by using double precision fields /////////////////////////////////////////////////////// typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object::scalar_type scalar_type; GridBase *grid = Data.Grid(); assert(grid!=NULL); @@ -419,20 +420,19 @@ template inline void sliceSum(const Lattice &Data,std::vector< } // sum over nodes. 
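[Note on the SchurRedBlack.h hunk above] The multi-RHS operator() is refactored so that source preparation and solution reconstruction go through the new RedBlackSource and RedBlackSolution helpers instead of an inline per-block loop. Below is a hedged sketch of how a driver composes them; it is written as a free-standing function for brevity, and the Guesser call convention and the single-block RedBlackSolve name are illustrative assumptions — only the two helper signatures are taken from the hunk.

// Illustrative driver (not Grid's exact member function): prepare all
// red-black sources at once, solve block by block, then reconstruct.
template <class Field, class Matrix, class Guesser>
void SolveBlocks(Matrix &_Matrix,
                 const std::vector<Field> &in,   // full-grid sources
                 std::vector<Field>       &out,  // full-grid solutions
                 Guesser                  &guess)
{
  GridBase *grid   = _Matrix.RedBlackGrid();
  const int nblock = in.size();

  std::vector<Field> src_o(nblock, grid);   // odd-checkerboard sources
  std::vector<Field> sol_o(nblock, grid);   // odd-checkerboard solutions

  RedBlackSource(_Matrix, in, src_o);        // new helper: all sources in one call
  for (int b = 0; b < nblock; b++) {
    guess(src_o[b], sol_o[b]);               // initial guess per block (assumed interface)
    RedBlackSolve(_Matrix, src_o[b], sol_o[b]); // assumed single-block Schur solve
  }
  RedBlackSolution(_Matrix, in, sol_o, out); // new helper: rebuild full-grid solutions
}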
- sobj gsum; for(int t=0;t_processor_coor[orthogdim] ) { - gsum=lsSum[lt]; + result[t]=lsSum[lt]; } else { - gsum=Zero(); + result[t]=Zero(); } - grid->GlobalSum(gsum); - - result[t]=gsum; } + scalar_type * ptr = (scalar_type *) &result[0]; + int words = fd*sizeof(sobj)/sizeof(scalar_type); + grid->GlobalSumVector(ptr, words); } template diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 5a26cce9..2292088c 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -364,16 +364,22 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( fineData_ , fineData, AcceleratorRead); + auto coarseData_p = &coarseData_[0]; + auto fineData_p = &fineData_[0]; + Coordinate fine_rdimensions = fine->_rdimensions; Coordinate coarse_rdimensions = coarse->_rdimensions; + + vobj zz = Zero(); accelerator_for(sc,coarse->oSites(),1,{ // One thread per sub block Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate - coarseData_[sc]=Zero(); + vobj cd = zz; + for(int sb=0;sb &coarseData,const Lattice &fineData) for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d]; Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions); - coarseData_[sc]=coarseData_[sc]+fineData_[sf]; + cd=cd+fineData_p[sf]; } + coarseData_p[sc] = cd; + }); return; } diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index ffec05a0..35d1b841 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -73,17 +73,17 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR @@ -102,17 +102,17 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include 
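[Note on the sliceSum hunk that closes above] The per-timeslice GlobalSum calls are replaced by a single GlobalSumVector: each rank fills result[t] locally (zero where it does not own the slice), and the flattened scalar words are then reduced in one collective. A hedged illustration of the same batching pattern using plain MPI as an analogue — the function name and types below are illustrative, not Grid's:

// Batching per-slice reductions: fill local contributions, then reduce once.
// Illustrative MPI analogue of a single GlobalSumVector over the result array.
#include <mpi.h>
#include <vector>
#include <complex>

void slice_sum_reduce(std::vector<std::complex<double>> &result)
{
  // Each entry holds this rank's contribution (zero if it owns no sites on
  // that slice).  One Allreduce over 2*result.size() doubles replaces
  // result.size() separate scalar reductions and their latency.
  MPI_Allreduce(MPI_IN_PLACE, reinterpret_cast<double *>(result.data()),
                2 * static_cast<int>(result.size()), MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD);
}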
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR @@ -131,17 +131,17 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include @@ -165,17 +165,17 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef 
INTERIOR_AND_EXTERIOR @@ -194,17 +194,17 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR @@ -223,17 +223,17 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include @@ -280,17 +280,17 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> 
void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include #undef INTERIOR_AND_EXTERIOR @@ -309,17 +309,17 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include #undef INTERIOR_AND_EXTERIOR @@ -338,17 +338,17 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include ///////////////////////////////////////////////////////////////// @@ -371,17 +371,17 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void 
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include #undef INTERIOR_AND_EXTERIOR @@ -400,17 +400,17 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include #undef INTERIOR_AND_EXTERIOR @@ -429,17 +429,17 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", 
"-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h index 4aed13bf..e025ba41 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h @@ -74,15 +74,15 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +// +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -97,15 +97,15 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +// +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR @@ -121,15 +121,15 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +// +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, 
DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single @@ -148,15 +148,15 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +// +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -171,15 +171,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +// +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -194,15 +194,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +// +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef MAYBEPERM #undef MULT_2SPIN @@ -228,14 +228,14 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeF int ss,int ssU,int Ls,int 
Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -249,14 +249,14 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGau int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -273,15 +273,15 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGau int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include - -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +// +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single @@ -299,14 +299,14 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGau int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView 
&U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -320,14 +320,14 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, Doubled int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -341,14 +341,14 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Doubled int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #endif // VEC 5D @@ -392,14 +392,14 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, 
SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -413,14 +413,14 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -434,14 +434,14 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, single @@ -459,14 +459,14 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -480,14 +480,14 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView 
&out) #include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -501,14 +501,14 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef MAYBEPERM #undef MULT_2SPIN @@ -533,14 +533,14 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeF int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -554,14 +554,14 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGau int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, 
FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -577,14 +577,14 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGau int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single @@ -602,14 +602,14 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGau int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -623,14 +623,14 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, Doubled int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView 
&U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -645,14 +645,14 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Doubled int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #endif // VEC 5D diff --git a/Grid/serialisation/BaseIO.cc b/Grid/serialisation/BaseIO.cc new file mode 100644 index 00000000..9afc20b3 --- /dev/null +++ b/Grid/serialisation/BaseIO.cc @@ -0,0 +1,35 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/serialisation/BaseIO.h + +Copyright (C) 2015 + +Author: Michael Marshall + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include + +NAMESPACE_BEGIN(Grid) + +std::uint64_t EigenIO::EigenResizeCounter(0); + +NAMESPACE_END(Grid) diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h index 49406201..25481301 100644 --- a/Grid/serialisation/BaseIO.h +++ b/Grid/serialisation/BaseIO.h @@ -9,6 +9,7 @@ Author: Antonin Portelli Author: Peter Boyle Author: Guido Cossu +Author: Michael Marshall This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,6 +31,7 @@ Author: Guido Cossu #ifndef GRID_SERIALISATION_ABSTRACT_READER_H #define GRID_SERIALISATION_ABSTRACT_READER_H +#include #include #include #include @@ -110,6 +112,10 @@ namespace Grid { template inline typename std::enable_if::value, typename Traits::scalar_type *>::type getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); } + + // Counter for resized EigenTensors (poor man's substitute for allocator) + // Defined in BinaryIO.cc + extern std::uint64_t EigenResizeCounter; } // Abstract writer/reader classes //////////////////////////////////////////// @@ -497,8 +503,14 @@ namespace Grid { typename std::enable_if::value, void>::type Reader::Reshape(ETensor &t, const std::array &dims ) { +#ifdef GRID_OMP + // The memory counter is the reason this must be done from the primary thread + assert(omp_in_parallel()==0 && "Deserialisation which resizes Eigen tensor must happen from primary thread"); +#endif + EigenIO::EigenResizeCounter -= static_cast(t.size()) * sizeof(typename ETensor::Scalar); //t.reshape( dims ); t.resize( dims ); + EigenIO::EigenResizeCounter += static_cast(t.size()) * sizeof(typename ETensor::Scalar); } template diff --git a/Grid/serialisation/Hdf5IO.cc b/Grid/serialisation/Hdf5IO.cc index 77396809..d3c7061e 100644 --- a/Grid/serialisation/Hdf5IO.cc +++ b/Grid/serialisation/Hdf5IO.cc @@ -1,3 +1,34 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./Grid/serialisation/VectorUtils.h + + Copyright (C) 2015 + + Author: Antonin Portelli + Author: Peter Boyle + Author: Guido Cossu + Author: Michael Marshall + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
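[Note on the BaseIO changes above] A global EigenIO::EigenResizeCounter is introduced and adjusted around every Eigen tensor resize during deserialisation, with an omp_in_parallel() assertion guarding the non-atomic counter. A minimal sketch of the same bookkeeping pattern; the counter name, the _OPENMP guard (Grid uses GRID_OMP), and the generic Tensor/Dims parameters are stand-ins, not Grid's exact template machinery:

// Track net bytes held by resized tensors: subtract the old footprint,
// resize, then add the new one.  Must run on the primary thread because
// the counter is a plain (non-atomic) global.
#include <cassert>
#include <cstdint>
#ifdef _OPENMP
#include <omp.h>
#endif

extern std::uint64_t EigenResizeCounter;   // stand-in for EigenIO::EigenResizeCounter

template <typename Tensor, typename Dims>
void CountedResize(Tensor &t, const Dims &dims)
{
#ifdef _OPENMP
  assert(omp_in_parallel() == 0 && "resize must happen on the primary thread");
#endif
  EigenResizeCounter -= static_cast<std::uint64_t>(t.size()) * sizeof(typename Tensor::Scalar);
  t.resize(dims);
  EigenResizeCounter += static_cast<std::uint64_t>(t.size()) * sizeof(typename Tensor::Scalar);
}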
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + #include using namespace Grid; diff --git a/Grid/serialisation/Hdf5IO.h b/Grid/serialisation/Hdf5IO.h index 19537599..46cb07e1 100644 --- a/Grid/serialisation/Hdf5IO.h +++ b/Grid/serialisation/Hdf5IO.h @@ -1,3 +1,34 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./Grid/serialisation/VectorUtils.h + + Copyright (C) 2015 + + Author: Peter Boyle + Author: Antonin Portelli + Author: Guido Cossu + Author: Michael Marshall + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + #ifndef GRID_SERIALISATION_HDF5_H #define GRID_SERIALISATION_HDF5_H @@ -34,11 +65,13 @@ namespace Grid template void writeDefault(const std::string &s, const U &x); template - typename std::enable_if>::is_number, void>::type + void writeRagged(const std::string &s, const std::vector &x); + template + typename std::enable_if>::value>::type writeDefault(const std::string &s, const std::vector &x); template - typename std::enable_if>::is_number, void>::type - writeDefault(const std::string &s, const std::vector &x); + typename std::enable_if>::value>::type + writeDefault(const std::string &s, const std::vector &x) { writeRagged(s, x); } template void writeMultiDim(const std::string &s, const std::vector & Dimensions, const U * pDataRowMajor, size_t NumElements); H5NS::Group & getGroup(void); @@ -64,11 +97,13 @@ namespace Grid template void readDefault(const std::string &s, U &output); template - typename std::enable_if>::is_number, void>::type + void readRagged(const std::string &s, std::vector &x); + template + typename std::enable_if>::value>::type readDefault(const std::string &s, std::vector &x); template - typename std::enable_if>::is_number, void>::type - readDefault(const std::string &s, std::vector &x); + typename std::enable_if>::value>::type + readDefault(const std::string &s, std::vector &x) { readRagged(s, x); } template void readMultiDim(const std::string &s, std::vector &buf, std::vector &dim); H5NS::Group & getGroup(void); @@ -176,24 +211,30 @@ namespace Grid } template - typename std::enable_if>::is_number, void>::type + typename std::enable_if>::value>::type Hdf5Writer::writeDefault(const std::string &s, const std::vector &x) { - // alias to element type - typedef typename element>::type Element; - - // flatten the vector and getting dimensions - Flatten> flat(x); - std::vector dim; - const auto &flatx = flat.getFlatVector(); - for (auto &d: flat.getDim()) - dim.push_back(d); 
- writeMultiDim(s, dim, &flatx[0], flatx.size()); + if (isRegularShape(x)) + { + // alias to element type + using Scalar = typename is_flattenable>::type; + + // flatten the vector and getting dimensions + Flatten> flat(x); + std::vector dim; + const auto &flatx = flat.getFlatVector(); + for (auto &d: flat.getDim()) + dim.push_back(d); + writeMultiDim(s, dim, &flatx[0], flatx.size()); + } + else + { + writeRagged(s, x); + } } template - typename std::enable_if>::is_number, void>::type - Hdf5Writer::writeDefault(const std::string &s, const std::vector &x) + void Hdf5Writer::writeRagged(const std::string &s, const std::vector &x) { push(s); writeSingleAttribute(x.size(), HDF5_GRID_GUARD "vector_size", @@ -229,7 +270,7 @@ namespace Grid void Hdf5Reader::readMultiDim(const std::string &s, std::vector &buf, std::vector &dim) { // alias to element type - typedef typename element>::type Element; + using Scalar = typename is_flattenable>::type; // read the dimensions H5NS::DataSpace dataSpace; @@ -260,37 +301,44 @@ namespace Grid H5NS::DataSet dataSet; dataSet = group_.openDataSet(s); - dataSet.read(buf.data(), Hdf5Type::type()); + dataSet.read(buf.data(), Hdf5Type::type()); } else { H5NS::Attribute attribute; attribute = group_.openAttribute(s); - attribute.read(Hdf5Type::type(), buf.data()); + attribute.read(Hdf5Type::type(), buf.data()); } } template - typename std::enable_if>::is_number, void>::type + typename std::enable_if>::value>::type Hdf5Reader::readDefault(const std::string &s, std::vector &x) { - // alias to element type - typedef typename element>::type Element; + if (H5Lexists (group_.getId(), s.c_str(), H5P_DEFAULT) > 0 + && H5Aexists_by_name(group_.getId(), s.c_str(), HDF5_GRID_GUARD "vector_size", H5P_DEFAULT ) > 0) + { + readRagged(s, x); + } + else + { + // alias to element type + using Scalar = typename is_flattenable>::type; - std::vector dim; - std::vector buf; - readMultiDim( s, buf, dim ); + std::vector dim; + std::vector buf; + readMultiDim( s, buf, dim ); - // reconstruct the multidimensional vector - Reconstruct> r(buf, dim); - - x = r.getVector(); + // reconstruct the multidimensional vector + Reconstruct> r(buf, dim); + + x = r.getVector(); + } } template - typename std::enable_if>::is_number, void>::type - Hdf5Reader::readDefault(const std::string &s, std::vector &x) + void Hdf5Reader::readRagged(const std::string &s, std::vector &x) { uint64_t size; diff --git a/Grid/serialisation/MacroMagic.h b/Grid/serialisation/MacroMagic.h index 0495b91e..de456305 100644 --- a/Grid/serialisation/MacroMagic.h +++ b/Grid/serialisation/MacroMagic.h @@ -118,13 +118,13 @@ static inline std::string SerialisableClassName(void) {return std::string(#cname static constexpr bool isEnum = false; \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\ template \ -static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ +static inline void write(::Grid::Writer &WR,const std::string &s, const cname &obj){ \ push(WR,s);\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ pop(WR);\ }\ template \ -static inline void read(Reader &RD,const std::string &s, cname &obj){ \ +static inline void read(::Grid::Reader &RD,const std::string &s, cname &obj){ \ if (!push(RD,s))\ {\ std::cout << ::Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl; \ diff --git a/Grid/serialisation/VectorUtils.h b/Grid/serialisation/VectorUtils.h index dd5ff0b8..8f490c64 100644 --- a/Grid/serialisation/VectorUtils.h +++ b/Grid/serialisation/VectorUtils.h 
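[Note on the Hdf5Writer/Hdf5Reader changes above] Nested-vector I/O now has two paths: rectangular (regular-shaped) vectors are flattened and written as one multi-dimensional dataset, while ragged vectors fall back to writeRagged/readRagged; on read, the presence of the vector_size guard attribute identifies data written the ragged way. A hedged outline of the write-side dispatch — isRegularShape, Flatten, writeMultiDim and writeRagged are taken from the surrounding diff, but the free-function wrapper and generic writer parameter are illustrative only:

// Write-side dispatch: one flat multi-dim dataset when the shape is regular,
// otherwise fall back to the element-by-element ragged path.
template <typename W, typename U>
void writeNested(W &writer, const std::string &name, const std::vector<U> &x)
{
  if (isRegularShape(x)) {                 // every nesting level is rectangular
    Flatten<std::vector<U>> flat(x);       // row-major flatten + dimension list
    std::vector<size_t> dim;
    for (auto &d : flat.getDim()) dim.push_back(d);
    const auto &buf = flat.getFlatVector();
    writer.writeMultiDim(name, dim, &buf[0], buf.size());
  } else {
    writer.writeRagged(name, x);           // per-element groups, guarded by vector_size
  }
}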
@@ -9,7 +9,8 @@ Author: Antonin Portelli Author: Peter Boyle Author: paboyle - + Author: Michael Marshall + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -236,21 +237,36 @@ namespace Grid { } } - // Vector element trait ////////////////////////////////////////////////////// - template - struct element + // is_flattenable::value is true if T is a std::vector<> which can be flattened ////////////////////// + template + struct is_flattenable : std::false_type { - typedef T type; - static constexpr bool is_number = false; + using type = T; + using grid_type = T; + static constexpr int vecRank = 0; + static constexpr bool isGridTensor = false; + static constexpr bool children_flattenable = std::is_arithmetic::value or is_complex::value; }; - + template - struct element> + struct is_flattenable::value>::type> : std::false_type { - typedef typename element::type type; - static constexpr bool is_number = std::is_arithmetic::value - or is_complex::value - or element::is_number; + using type = typename GridTypeMapper::scalar_type; + using grid_type = T; + static constexpr int vecRank = 0; + static constexpr bool isGridTensor = true; + static constexpr bool children_flattenable = true; + }; + + template + struct is_flattenable, typename std::enable_if::children_flattenable>::type> + : std::true_type + { + using type = typename is_flattenable::type; + using grid_type = typename is_flattenable::grid_type; + static constexpr bool isGridTensor = is_flattenable::isGridTensor; + static constexpr int vecRank = is_flattenable::vecRank + 1; + static constexpr bool children_flattenable = true; }; // Vector flattening utility class //////////////////////////////////////////// @@ -259,23 +275,30 @@ namespace Grid { class Flatten { public: - typedef typename element::type Element; + using Scalar = typename is_flattenable::type; + static constexpr bool isGridTensor = is_flattenable::isGridTensor; public: - explicit Flatten(const V &vector); - const V & getVector(void); - const std::vector & getFlatVector(void); - const std::vector & getDim(void); + explicit Flatten(const V &vector); + const V & getVector(void) const { return vector_; } + const std::vector & getFlatVector(void) const { return flatVector_; } + const std::vector & getDim(void) const { return dim_; } private: - void accumulate(const Element &e); - template - void accumulate(const W &v); - void accumulateDim(const Element &e); - template - void accumulateDim(const W &v); + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + accumulate(const W &e); + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + accumulate(const W &e); + template typename std::enable_if< is_flattenable::value>::type + accumulate(const W &v); + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + accumulateDim(const W &e) {} // Innermost is a scalar - do nothing + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + accumulateDim(const W &e); + template typename std::enable_if< is_flattenable::value>::type + accumulateDim(const W &v); private: - const V &vector_; - std::vector flatVector_; - std::vector dim_; + const V &vector_; + std::vector flatVector_; + std::vector dim_; }; // Class to reconstruct a multidimensional std::vector @@ -283,38 +306,57 @@ namespace Grid { class Reconstruct { public: - typedef 
typename element::type Element; + using Scalar = typename is_flattenable::type; + static constexpr bool isGridTensor = is_flattenable::isGridTensor; public: - Reconstruct(const std::vector &flatVector, + Reconstruct(const std::vector &flatVector, const std::vector &dim); - const V & getVector(void); - const std::vector & getFlatVector(void); - const std::vector & getDim(void); + const V & getVector(void) const { return vector_; } + const std::vector & getFlatVector(void) const { return flatVector_; } + const std::vector & getDim(void) const { return dim_; } private: - void fill(std::vector &v); - template - void fill(W &v); - void resize(std::vector &v, const unsigned int dim); - template - void resize(W &v, const unsigned int dim); + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + fill(W &v); + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + fill(W &v); + template typename std::enable_if< is_flattenable::value>::type + fill(W &v); + template typename std::enable_if< is_flattenable::value && is_flattenable::vecRank==1>::type + resize(W &v, const unsigned int dim); + template typename std::enable_if< is_flattenable::value && (is_flattenable::vecRank>1)>::type + resize(W &v, const unsigned int dim); + template typename std::enable_if::isGridTensor>::type + checkInnermost(const W &e) {} // Innermost is a scalar - do nothing + template typename std::enable_if< is_flattenable::isGridTensor>::type + checkInnermost(const W &e); private: - V vector_; - const std::vector &flatVector_; - std::vector dim_; - size_t ind_{0}; - unsigned int dimInd_{0}; + V vector_; + const std::vector &flatVector_; + std::vector dim_; + size_t ind_{0}; + unsigned int dimInd_{0}; }; // Flatten class template implementation template - void Flatten::accumulate(const Element &e) + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + Flatten::accumulate(const W &e) { flatVector_.push_back(e); } template - template - void Flatten::accumulate(const W &v) + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + Flatten::accumulate(const W &e) + { + for (const Scalar &x: e) { + flatVector_.push_back(x); + } + } + + template + template typename std::enable_if::value>::type + Flatten::accumulate(const W &v) { for (auto &e: v) { @@ -323,11 +365,17 @@ namespace Grid { } template - void Flatten::accumulateDim(const Element &e) {}; + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + Flatten::accumulateDim(const W &e) + { + using Traits = GridTypeMapper::grid_type>; + for (int rank=0; rank < Traits::Rank; ++rank) + dim_.push_back(Traits::Dimension(rank)); + } template - template - void Flatten::accumulateDim(const W &v) + template typename std::enable_if::value>::type + Flatten::accumulateDim(const W &v) { dim_.push_back(v.size()); accumulateDim(v[0]); @@ -337,42 +385,36 @@ namespace Grid { Flatten::Flatten(const V &vector) : vector_(vector) { - accumulate(vector_); accumulateDim(vector_); - } - - template - const V & Flatten::getVector(void) - { - return vector_; - } - - template - const std::vector::Element> & - Flatten::getFlatVector(void) - { - return flatVector_; - } - - template - const std::vector & Flatten::getDim(void) - { - return dim_; + std::size_t TotalSize{ dim_[0] }; + for (int i = 1; i < dim_.size(); ++i) { + TotalSize *= dim_[i]; + } + flatVector_.reserve(TotalSize); + accumulate(vector_); } // Reconstruct class template implementation template - void 
Reconstruct::fill(std::vector &v) + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + Reconstruct::fill(W &v) + { + v = flatVector_[ind_++]; + } + + template + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + Reconstruct::fill(W &v) { for (auto &e: v) { e = flatVector_[ind_++]; } } - + template - template - void Reconstruct::fill(W &v) + template typename std::enable_if::value>::type + Reconstruct::fill(W &v) { for (auto &e: v) { @@ -381,14 +423,15 @@ namespace Grid { } template - void Reconstruct::resize(std::vector &v, const unsigned int dim) + template typename std::enable_if::value && is_flattenable::vecRank==1>::type + Reconstruct::resize(W &v, const unsigned int dim) { v.resize(dim_[dim]); } template - template - void Reconstruct::resize(W &v, const unsigned int dim) + template typename std::enable_if::value && (is_flattenable::vecRank>1)>::type + Reconstruct::resize(W &v, const unsigned int dim) { v.resize(dim_[dim]); for (auto &e: v) @@ -398,34 +441,31 @@ namespace Grid { } template - Reconstruct::Reconstruct(const std::vector &flatVector, + template typename std::enable_if::isGridTensor>::type + Reconstruct::checkInnermost(const W &) + { + using Traits = GridTypeMapper::grid_type>; + const int gridRank{Traits::Rank}; + const int dimRank{static_cast(dim_.size())}; + assert(dimRank >= gridRank && "Tensor rank too low for Grid tensor"); + for (int i=0; i + Reconstruct::Reconstruct(const std::vector &flatVector, const std::vector &dim) : flatVector_(flatVector) , dim_(dim) { + checkInnermost(vector_); + assert(dim_.size() == is_flattenable::vecRank && "Tensor rank doesn't match nested std::vector rank"); resize(vector_, 0); fill(vector_); } - template - const V & Reconstruct::getVector(void) - { - return vector_; - } - - template - const std::vector::Element> & - Reconstruct::getFlatVector(void) - { - return flatVector_; - } - - template - const std::vector & Reconstruct::getDim(void) - { - return dim_; - } - // Vector IO utilities /////////////////////////////////////////////////////// // helper function to read space-separated values template @@ -459,6 +499,64 @@ namespace Grid { return os; } + + // In general, scalar types are considered "flattenable" (regularly shaped) + template + bool isRegularShapeHelper(const std::vector &, std::vector &, int, bool) + { + return true; + } + + template + bool isRegularShapeHelper(const std::vector> &v, std::vector &Dims, int Depth, bool bFirst) + { + if( bFirst) + { + assert( Dims.size() == Depth && "Bug: Delete this message after testing" ); + Dims.push_back(v[0].size()); + if (!Dims[Depth]) + return false; + } + else + { + assert( Dims.size() >= Depth + 1 && "Bug: Delete this message after testing" ); + } + for (std::size_t i = 0; i < v.size(); ++i) + { + if (v[i].size() != Dims[Depth] || !isRegularShapeHelper(v[i], Dims, Depth + 1, bFirst && i==0)) + { + return false; + } + } + return true; + } + + template + bool isRegularShape(const T &t) { return true; } + + template + bool isRegularShape(const std::vector &v) { return !v.empty(); } + + // Return non-zero if all dimensions of this std::vector> are regularly shaped + template + bool isRegularShape(const std::vector> &v) + { + if (v.empty() || v[0].empty()) + return false; + // Make sure all of my rows are the same size + std::vector Dims; + Dims.reserve(is_flattenable::vecRank); + Dims.push_back(v.size()); + Dims.push_back(v[0].size()); + for (std::size_t i = 0; i < Dims[0]; ++i) + { + if (v[i].size() != Dims[1] || 
!isRegularShapeHelper(v[i], Dims, 2, i==0)) + { + return false; + } + } + return true; + } } // helper function to read space-separated values diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index 36becc49..be045ede 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -417,7 +417,7 @@ public: stream << "{"; for (int j = 0; j < N; j++) { stream << o._internal[i][j]; - if (i < N - 1) stream << ","; + if (j < N - 1) stream << ","; } stream << "}"; if (i != N - 1) stream << "\n\t\t"; diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 9c40f538..01a0dd22 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -83,11 +83,11 @@ void acceleratorInit(void) printf("AcceleratorCudaInit: using default device \n"); printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n"); printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n"); - printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n"); + printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n"); } #else printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank); - printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n"); + printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n"); cudaSetDevice(rank); #endif if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index 03f3ee61..d48486c0 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -182,7 +182,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); diff --git a/configure.ac b/configure.ac index 721d890e..406b0b74 100644 --- a/configure.ac +++ b/configure.ac @@ -390,6 +390,7 @@ case ${CXXTEST} in CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" if test $ac_openmp = yes; then CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" + LDFLAGS="$LDFLAGS -Xcompiler -fopenmp" fi ;; hipcc) diff --git a/documentation/Grid.pdf b/documentation/Grid.pdf index 868c6db4..df3304eb 100644 Binary files a/documentation/Grid.pdf and b/documentation/Grid.pdf differ diff --git a/documentation/manual.rst b/documentation/manual.rst index d51f07c1..e545bdaf 100644 --- a/documentation/manual.rst +++ b/documentation/manual.rst @@ -1787,7 +1787,7 @@ Hdf5Writer Hdf5Reader HDF5 Write interfaces, similar to the XML facilities in QDP++ are presented. However, the serialisation routines are automatically generated by the macro, and a virtual -reader adn writer interface enables writing to any of a number of formats. +reader and writer interface enables writing to any of a number of formats. **Example**:: @@ -1814,6 +1814,91 @@ reader adn writer interface enables writing to any of a number of formats. } +Eigen tensor support -- added 2019H1 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Serialisation library was expanded in 2019 to support de/serialisation of +Eigen tensors. De/serialisation of existing types was not changed. Data files +without Eigen tensors remain compatible with earlier versions of Grid and other readers. +Conversely, data files containing serialised Eigen tensors are a breaking change: they cannot be read by earlier versions of Grid. 
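As a minimal usage sketch (not itself part of the patch: the file name, tensor rank and
sizes are arbitrary, an HDF5-enabled build is assumed, and ``Grid/Grid.h`` is assumed to
pull in the bundled Eigen tensor support as it does for the serialisation tests), an Eigen
tensor can be written and read back with the ordinary ``write``/``read`` free functions::

    #include <Grid/Grid.h>

    using namespace Grid;

    void ExampleEigenTensorIO(void)
    {
      // a small rank-3 complex tensor; row-major storage, as in the A2A meson-field test
      Eigen::Tensor<ComplexD, 3, Eigen::RowMajor> T(2, 3, 4);
      T.setRandom();

      { // scope the writer so the file is closed before re-opening it for reading
        Hdf5Writer writer("EigenTensorExample.h5");
        write(writer, "tensor", T);   // dimensions and data stored as a single DataSet
      }

      Eigen::Tensor<ComplexD, 3, Eigen::RowMajor> U(1, 1, 1);  // deliberately wrong sizes
      Hdf5Reader reader("EigenTensorExample.h5");
      read(reader, "tensor", U);      // rank must match; dimensions are resized on read
    }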
+ +Eigen tensor serialisation support was added to BaseIO, which was modified to provide a Traits class +to recognise Eigen tensors with elements that are either: primitive scalars (arithmetic and complex types); +or Grid tensors. + +**Traits determining de/serialisable scalars**:: + + // Is this an Eigen tensor + template <typename T> struct is_tensor : std::integral_constant<bool, std::is_base_of<Eigen::TensorBase<T, Eigen::ReadOnlyAccessors>, T>::value> {}; + // Is this an Eigen tensor of a supported scalar + template <typename T, typename V = void> struct is_tensor_of_scalar : public std::false_type {}; + template <typename T> struct is_tensor_of_scalar<T, typename std::enable_if<is_tensor<T>::value && is_scalar<typename T::Scalar>::value>::type> : public std::true_type {}; + // Is this an Eigen tensor of a supported container + template <typename T, typename V = void> struct is_tensor_of_container : public std::false_type {}; + template <typename T> struct is_tensor_of_container<T, typename std::enable_if<is_tensor<T>::value && isGridTensor<typename T::Scalar>::value>::type> : public std::true_type {}; + + +Eigen tensors are regular, multidimensional objects, and each Reader/Writer +was extended to support this new datatype. Where the Eigen tensor contains +a Grid tensor, the dimensions of the data written are the dimensions of the +Eigen tensor plus the dimensions of the underlying Grid scalar. Dimensions +of size 1 are preserved. + +**New Reader/Writer methods for multi-dimensional data**:: + + template <typename U> + void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim); + template <typename U> + void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements); + + +On readback, the Eigen tensor rank must match the data being read, but the tensor +dimensions will be resized if necessary. Resizing is not possible for Eigen::TensorMap +because these tensors use a buffer provided at construction, and this buffer cannot be changed. +Deserialisation failures cause Grid to assert. + + +HDF5 Optimisations -- added June 2021 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Grid serialisation is intended to be light and deterministic, and to provide a layer of abstraction over +multiple file formats. HDF5 excels at handling multi-dimensional data, and the Grid Hdf5Reader/Hdf5Writer exploit this. +When serialising nested ``std::vector<T>``, where ``T`` is an arithmetic or complex type, +the Hdf5Writer writes the data as an Hdf5 DataSet object. + +However, nested ``std::vector<std::vector<T>>`` might be "ragged", i.e. not necessarily regular. E.g. a 3d nested +``std::vector`` might contain 2 rows, the first being a 2 x 2 block and the second row being a 1 x 2 block. +A bug existed whereby this was not checked on write, so nested, ragged vectors +were written as a regular dataset, with a buffer under/overrun and jumbled contents. + +Clearly this was not used in production, as the bug went undetected until now. Fixing this bug +is an opportunity to further optimise the HDF5 file format. + +The goals of this change are to: + +* Make changes to the Hdf5 file format only -- i.e. do not impact other file formats + +* Implement file format changes in such a way that they are transparent to the Grid reader + +* Correct the bug for ragged vectors of numeric / complex types + +* Extend the support of nested std::vector to arbitrarily nested Grid tensors + + +The trait class ``element`` has been replaced by ``is_flattenable``, which is a trait class for +potentially "flattenable" objects. These are (possibly nested) ``std::vector<T>`` where ``T`` is +an arithmetic, complex or Grid tensor type. Flattenable objects are tested on write +(with the function ``isRegularShape``) to see whether they actually are regular. 
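A minimal sketch of the regular/ragged distinction follows (illustrative only: the file and
object names are invented for this example, and an HDF5-enabled build is assumed). The call
site is identical in both cases; the writer chooses the layout internally::

    std::vector<std::vector<double>> regular = { {1, 2, 3},
                                                 {4, 5, 6} };   // 2 x 3: every row the same length
    std::vector<std::vector<double>> ragged  = { {1, 2},
                                                 {3, 4, 5} };   // row lengths differ

    { // scope the writer so the file is closed before re-opening it for reading
      Hdf5Writer writer("RaggedExample.h5");
      write(writer, "regular", regular);   // flattened and written as one 2 x 3 DataSet
      write(writer, "ragged",  ragged);    // falls back to the ragged layout described below
    }

    std::vector<std::vector<double>> check;
    Hdf5Reader reader("RaggedExample.h5");
    read(reader, "regular", check);        // readback is transparent:
    read(reader, "ragged",  check);        // the reader detects which layout was used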
+ +Flattenable, regular objects are written to a multidimensional HDF5 DataSet. +Otherwise, an Hdf5 sub group is created with the object "name", and each element of the outer dimension is +recursively written to as object "name_n", where n is a 0-indexed number. + +On readback (by Grid)), the presence of a subgroup containing the attribute ``Grid_vector_size`` triggers a +"ragged read", otherwise a read from a DataSet is attempted. + + Data parallel field IO ----------------------- diff --git a/systems/Tursa/config-command b/systems/Tursa/config-command new file mode 100644 index 00000000..b47c34e5 --- /dev/null +++ b/systems/Tursa/config-command @@ -0,0 +1,12 @@ +../../configure \ + --enable-comms=mpi \ + --enable-simd=GPU \ + --enable-shm=nvlink \ + --enable-gen-simd-width=64 \ + --enable-accelerator=cuda \ + --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \ + --disable-accelerator-cshift \ + --disable-unified \ + CXX=nvcc \ + LDFLAGS="-cudart shared " \ + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" diff --git a/systems/Tursa/dwf.16node.perf b/systems/Tursa/dwf.16node.perf new file mode 100644 index 00000000..a51aae94 --- /dev/null +++ b/systems/Tursa/dwf.16node.perf @@ -0,0 +1,293 @@ +tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n12 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 
+tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, 
--enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 64 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7f05c0000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+Current Grid git commit hash=9d2238148c56e3fbadfa95dcabf2b83d4bde14cd: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34004218675 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 1.814936 s : Grid Layout +Grid : Message : 1.814947 s : Global lattice size : 64 64 64 256 +Grid : Message : 1.814952 s : OpenMP threads : 4 +Grid : Message : 1.814955 s : MPI tasks : 2 2 2 8 +Grid : Message : 1.859229 s : Making s innermost grids +Grid : Message : 1.907983 s : Initialising 4d RNG +Grid : Message : 1.999619 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 1.999657 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 3.786102 s : Initialising 5d RNG +Grid : Message : 5.361999 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 5.362036 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 38.698345 s : Initialised RNGs +Grid : Message : 42.821728 s : Drawing gauge field +Grid : Message : 43.916364 s : Random gauge initialised +Grid : Message : 46.410003 s : Setting up Cshift based reference +Grid : Message : 54.242661 s : ***************************************************************** +Grid : Message : 54.242686 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 54.242688 s : ***************************************************************** +Grid : Message : 54.242689 s : ***************************************************************** +Grid : Message : 54.242690 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 54.242691 s : * Vectorising space-time by 8 +Grid : Message : 54.242692 s : * VComplexF size is 64 B +Grid : Message : 54.242694 s : * SINGLE precision +Grid : Message : 54.242697 s : * Using Overlapped Comms/Compute +Grid : Message : 54.242698 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 54.242699 s : ***************************************************************** +Grid : Message : 56.314112 s : Called warmup +Grid : Message : 84.246354 s : Called Dw 3000 times in 2.79318e+07 us +Grid : Message : 84.246405 s : mflop/s = 1.52229e+08 +Grid : Message : 84.246408 s : mflop/s per rank = 2.37857e+06 +Grid : Message : 84.246412 s : mflop/s per node = 9.51428e+06 +Grid : Message : 84.246414 s : RF GiB/s (base 2) = 309325 +Grid : Message : 84.246417 s : mem GiB/s (base 2) = 193328 +Grid : Message : 84.250016 s : norm diff 1.03478e-13 +Grid : Message : 84.285132 s : #### Dhop calls report +Grid : Message : 84.285137 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 84.285140 s : WilsonFermion5D TotalTime /Calls : 4703.27 us +Grid : Message : 84.285142 s : WilsonFermion5D CommTime /Calls : 3131.05 us +Grid : Message : 84.285144 s : WilsonFermion5D FaceTime /Calls : 492.972 us +Grid : Message : 84.285146 s : WilsonFermion5D ComputeTime1/Calls : 56.9085 us +Grid : Message : 84.285148 s : WilsonFermion5D ComputeTime2/Calls : 
1099.95 us +Grid : Message : 84.285160 s : Average mflops/s per call : 1.43412e+10 +Grid : Message : 84.285165 s : Average mflops/s per call per rank : 2.24082e+08 +Grid : Message : 84.285170 s : Average mflops/s per call per node : 8.96328e+08 +Grid : Message : 84.285173 s : Average mflops/s per call (full) : 1.53416e+08 +Grid : Message : 84.285176 s : Average mflops/s per call per rank (full): 2.39712e+06 +Grid : Message : 84.285194 s : Average mflops/s per call per node (full): 9.58847e+06 +Grid : Message : 84.285197 s : WilsonFermion5D Stencil +Grid : Message : 84.285271 s : Stencil calls 3001 +Grid : Message : 84.285275 s : Stencil halogtime 0 +Grid : Message : 84.285277 s : Stencil gathertime 55.2059 +Grid : Message : 84.285281 s : Stencil gathermtime 20.0923 +Grid : Message : 84.285283 s : Stencil mergetime 18.9057 +Grid : Message : 84.285286 s : Stencil decompresstime 0.0619793 +Grid : Message : 84.285289 s : Stencil comms_bytes 4.02653e+08 +Grid : Message : 84.285292 s : Stencil commtime 6323.57 +Grid : Message : 84.285295 s : Stencil 63.675 GB/s per rank +Grid : Message : 84.285298 s : Stencil 254.7 GB/s per node +Grid : Message : 84.285301 s : WilsonFermion5D StencilEven +Grid : Message : 84.285316 s : WilsonFermion5D StencilOdd +Grid : Message : 84.285333 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 84.285336 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 84.285337 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 106.985790 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 106.985814 s : Called DwDag +Grid : Message : 106.985815 s : norm dag result 12.0421 +Grid : Message : 107.188790 s : norm dag ref 12.0421 +Grid : Message : 107.349010 s : norm dag diff 7.63254e-14 +Grid : Message : 107.762980 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 107.458374 s : src_e0.499998 +Grid : Message : 107.754073 s : src_o0.500002 +Grid : Message : 107.855191 s : ********************************************************* +Grid : Message : 107.855194 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 107.855195 s : * Vectorising space-time by 8 +Grid : Message : 107.855197 s : * SINGLE precision +Grid : Message : 107.855198 s : * Using Overlapped Comms/Compute +Grid : Message : 107.855199 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 107.855200 s : ********************************************************* +Grid : Message : 121.549348 s : Deo mflop/s = 1.56492e+08 +Grid : Message : 121.549382 s : Deo mflop/s per rank 2.44518e+06 +Grid : Message : 121.549384 s : Deo mflop/s per node 9.78074e+06 +Grid : Message : 121.549387 s : #### Dhop calls report +Grid : Message : 121.549388 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 121.549390 s : WilsonFermion5D TotalTime /Calls : 4563.01 us +Grid : Message : 121.549393 s : WilsonFermion5D CommTime /Calls : 2967.77 us +Grid : Message : 121.549395 s : WilsonFermion5D FaceTime /Calls : 601.095 us +Grid : Message : 121.549397 s : WilsonFermion5D ComputeTime1/Calls : 59.9877 us +Grid : Message : 121.549399 s : WilsonFermion5D ComputeTime2/Calls : 1038.46 us +Grid : Message : 121.549423 s : Average mflops/s per call : 1.2726e+10 +Grid : Message : 121.549428 s : Average mflops/s per call per rank : 1.98843e+08 +Grid : Message : 121.549430 s : Average mflops/s per call per node : 7.95373e+08 +Grid : Message : 121.549432 s : Average mflops/s per call (full) : 1.58131e+08 +Grid : Message : 121.549436 s : Average mflops/s per 
call per rank (full): 2.4708e+06 +Grid : Message : 121.549440 s : Average mflops/s per call per node (full): 9.88321e+06 +Grid : Message : 121.549442 s : WilsonFermion5D Stencil +Grid : Message : 121.549453 s : WilsonFermion5D StencilEven +Grid : Message : 121.549472 s : WilsonFermion5D StencilOdd +Grid : Message : 121.549484 s : Stencil calls 3001 +Grid : Message : 121.549490 s : Stencil halogtime 0 +Grid : Message : 121.549492 s : Stencil gathertime 55.2206 +Grid : Message : 121.549496 s : Stencil gathermtime 19.4562 +Grid : Message : 121.549500 s : Stencil mergetime 18.3469 +Grid : Message : 121.549502 s : Stencil decompresstime 0.0646451 +Grid : Message : 121.549506 s : Stencil comms_bytes 2.01327e+08 +Grid : Message : 121.549510 s : Stencil commtime 2979.17 +Grid : Message : 121.549512 s : Stencil 67.5782 GB/s per rank +Grid : Message : 121.549514 s : Stencil 270.313 GB/s per node +Grid : Message : 121.549517 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 121.549519 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 121.549522 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 121.625928 s : r_e6.02108 +Grid : Message : 121.634489 s : r_o6.02101 +Grid : Message : 121.640496 s : res12.0421 +Grid : Message : 122.275455 s : norm diff 0 +Grid : Message : 123.135840 s : norm diff even 0 +Grid : Message : 123.389190 s : norm diff odd 0 diff --git a/systems/Tursa/dwf.4node.perf b/systems/Tursa/dwf.4node.perf new file mode 100644 index 00000000..9073969e --- /dev/null +++ b/systems/Tursa/dwf.4node.perf @@ -0,0 +1,245 @@ +tu-c0r0n00 - 0 device=0 binding=--interleave=0,1 +tu-c0r0n00 - 1 device=1 binding=--interleave=2,3 +tu-c0r0n09 - 1 device=1 binding=--interleave=2,3 +tu-c0r0n00 - 2 device=2 binding=--interleave=4,5 +tu-c0r0n06 - 0 device=0 binding=--interleave=0,1 +tu-c0r0n06 - 1 device=1 binding=--interleave=2,3 +tu-c0r0n09 - 0 device=0 binding=--interleave=0,1 +tu-c0r0n09 - 2 device=2 binding=--interleave=4,5 +tu-c0r0n03 - 1 device=1 binding=--interleave=2,3 +tu-c0r0n06 - 2 device=2 binding=--interleave=4,5 +tu-c0r0n09 - 3 device=3 binding=--interleave=6,7 +tu-c0r0n00 - 3 device=3 binding=--interleave=6,7 +tu-c0r0n03 - 0 device=0 binding=--interleave=0,1 +tu-c0r0n03 - 2 device=2 binding=--interleave=4,5 +tu-c0r0n06 - 3 device=3 binding=--interleave=6,7 +tu-c0r0n03 - 3 device=3 binding=--interleave=6,7 +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
+AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 16 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fcd80000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ 
GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +Current Grid git commit hash=9d2238148c56e3fbadfa95dcabf2b83d4bde14cd: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34004218675 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 1.198523 s : Grid Layout +Grid : Message : 1.198530 s : Global lattice size : 64 64 64 64 +Grid : Message : 1.198534 s : OpenMP threads : 4 +Grid : Message : 1.198535 s : MPI tasks : 2 2 2 2 +Grid : Message : 1.397615 s : Making s innermost grids +Grid : Message : 1.441828 s : Initialising 4d RNG +Grid : Message : 1.547973 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 1.547998 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 1.954777 s : Initialising 5d RNG +Grid : Message : 3.633825 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 3.633869 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 12.162710 s : Initialised RNGs +Grid : Message : 15.882520 s : Drawing gauge field +Grid : Message : 15.816362 s : Random gauge initialised +Grid : Message : 17.279671 s : Setting up Cshift based reference +Grid : Message : 26.331426 s : ***************************************************************** +Grid : Message : 26.331452 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 26.331454 s : ***************************************************************** +Grid : Message : 26.331456 s : ***************************************************************** +Grid : Message : 26.331458 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 26.331459 s : * Vectorising space-time by 8 +Grid : Message : 26.331463 s : * VComplexF size is 64 B +Grid : Message : 26.331465 s : * SINGLE precision +Grid : Message : 26.331467 s : * Using Overlapped Comms/Compute +Grid : Message : 26.331468 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 26.331469 s : ***************************************************************** +Grid : Message : 28.413717 s : Called warmup +Grid : Message : 56.418423 s : Called Dw 3000 times in 2.80047e+07 us +Grid : Message : 56.418476 s : mflop/s = 3.79581e+07 +Grid : Message : 56.418479 s : mflop/s per rank = 2.37238e+06 +Grid : 
Message : 56.418481 s : mflop/s per node = 9.48953e+06 +Grid : Message : 56.418483 s : RF GiB/s (base 2) = 77130 +Grid : Message : 56.418485 s : mem GiB/s (base 2) = 48206.3 +Grid : Message : 56.422076 s : norm diff 1.03481e-13 +Grid : Message : 56.456894 s : #### Dhop calls report +Grid : Message : 56.456899 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 56.456903 s : WilsonFermion5D TotalTime /Calls : 4710.93 us +Grid : Message : 56.456905 s : WilsonFermion5D CommTime /Calls : 3196.15 us +Grid : Message : 56.456908 s : WilsonFermion5D FaceTime /Calls : 494.392 us +Grid : Message : 56.456910 s : WilsonFermion5D ComputeTime1/Calls : 44.4107 us +Grid : Message : 56.456912 s : WilsonFermion5D ComputeTime2/Calls : 1037.75 us +Grid : Message : 56.456921 s : Average mflops/s per call : 3.55691e+09 +Grid : Message : 56.456925 s : Average mflops/s per call per rank : 2.22307e+08 +Grid : Message : 56.456928 s : Average mflops/s per call per node : 8.89228e+08 +Grid : Message : 56.456930 s : Average mflops/s per call (full) : 3.82915e+07 +Grid : Message : 56.456933 s : Average mflops/s per call per rank (full): 2.39322e+06 +Grid : Message : 56.456952 s : Average mflops/s per call per node (full): 9.57287e+06 +Grid : Message : 56.456954 s : WilsonFermion5D Stencil +Grid : Message : 56.457016 s : Stencil calls 3001 +Grid : Message : 56.457022 s : Stencil halogtime 0 +Grid : Message : 56.457024 s : Stencil gathertime 55.9154 +Grid : Message : 56.457026 s : Stencil gathermtime 20.1073 +Grid : Message : 56.457028 s : Stencil mergetime 18.5585 +Grid : Message : 56.457030 s : Stencil decompresstime 0.0639787 +Grid : Message : 56.457032 s : Stencil comms_bytes 4.02653e+08 +Grid : Message : 56.457034 s : Stencil commtime 6379.93 +Grid : Message : 56.457036 s : Stencil 63.1124 GB/s per rank +Grid : Message : 56.457038 s : Stencil 252.45 GB/s per node +Grid : Message : 56.457040 s : WilsonFermion5D StencilEven +Grid : Message : 56.457048 s : WilsonFermion5D StencilOdd +Grid : Message : 56.457062 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 56.457065 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 56.457066 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 79.259261 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 79.259287 s : Called DwDag +Grid : Message : 79.259288 s : norm dag result 12.0421 +Grid : Message : 79.271740 s : norm dag ref 12.0421 +Grid : Message : 79.287759 s : norm dag diff 7.63236e-14 +Grid : Message : 79.328100 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 79.955951 s : src_e0.499997 +Grid : Message : 80.633620 s : src_o0.500003 +Grid : Message : 80.164163 s : ********************************************************* +Grid : Message : 80.164168 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 80.164170 s : * Vectorising space-time by 8 +Grid : Message : 80.164172 s : * SINGLE precision +Grid : Message : 80.164174 s : * Using Overlapped Comms/Compute +Grid : Message : 80.164177 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 80.164178 s : ********************************************************* +Grid : Message : 93.797635 s : Deo mflop/s = 3.93231e+07 +Grid : Message : 93.797670 s : Deo mflop/s per rank 2.45769e+06 +Grid : Message : 93.797672 s : Deo mflop/s per node 9.83077e+06 +Grid : Message : 93.797674 s : #### Dhop calls report +Grid : Message : 93.797675 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 93.797677 s : 
WilsonFermion5D TotalTime /Calls : 4542.83 us +Grid : Message : 93.797679 s : WilsonFermion5D CommTime /Calls : 2978.97 us +Grid : Message : 93.797681 s : WilsonFermion5D FaceTime /Calls : 602.287 us +Grid : Message : 93.797683 s : WilsonFermion5D ComputeTime1/Calls : 67.1416 us +Grid : Message : 93.797685 s : WilsonFermion5D ComputeTime2/Calls : 1004.07 us +Grid : Message : 93.797713 s : Average mflops/s per call : 3.30731e+09 +Grid : Message : 93.797717 s : Average mflops/s per call per rank : 2.06707e+08 +Grid : Message : 93.797719 s : Average mflops/s per call per node : 8.26827e+08 +Grid : Message : 93.797721 s : Average mflops/s per call (full) : 3.97084e+07 +Grid : Message : 93.797727 s : Average mflops/s per call per rank (full): 2.48178e+06 +Grid : Message : 93.797732 s : Average mflops/s per call per node (full): 9.92711e+06 +Grid : Message : 93.797735 s : WilsonFermion5D Stencil +Grid : Message : 93.797746 s : WilsonFermion5D StencilEven +Grid : Message : 93.797758 s : WilsonFermion5D StencilOdd +Grid : Message : 93.797769 s : Stencil calls 3001 +Grid : Message : 93.797773 s : Stencil halogtime 0 +Grid : Message : 93.797776 s : Stencil gathertime 56.7458 +Grid : Message : 93.797780 s : Stencil gathermtime 22.6504 +Grid : Message : 93.797782 s : Stencil mergetime 21.1913 +Grid : Message : 93.797786 s : Stencil decompresstime 0.0556481 +Grid : Message : 93.797788 s : Stencil comms_bytes 2.01327e+08 +Grid : Message : 93.797791 s : Stencil commtime 2989.33 +Grid : Message : 93.797795 s : Stencil 67.3484 GB/s per rank +Grid : Message : 93.797798 s : Stencil 269.394 GB/s per node +Grid : Message : 93.797801 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 93.797803 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 93.797805 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 93.873429 s : r_e6.02111 +Grid : Message : 93.879931 s : r_o6.02102 +Grid : Message : 93.885912 s : res12.0421 +Grid : Message : 94.876555 s : norm diff 0 +Grid : Message : 95.485643 s : norm diff even 0 +Grid : Message : 95.581236 s : norm diff odd 0 diff --git a/systems/Tursa/dwf16.slurm b/systems/Tursa/dwf16.slurm new file mode 100644 index 00000000..a35e55be --- /dev/null +++ b/systems/Tursa/dwf16.slurm @@ -0,0 +1,33 @@ +#!/bin/bash +#SBATCH -J dslash +#SBATCH -A tc002 +#SBATCH -t 2:20:00 +#SBATCH --exclusive +#SBATCH --nodes=16 +#SBATCH --ntasks=64 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=8 +#SBATCH --time=12:00:00 +#SBATCH --partition=gpu +#SBATCH --gres=gpu:4 +#SBATCH --output=%x.%j.out +#SBATCH --error=%x.%j.err + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=yes +export UCX_MEMTYPE_CACHE=n +OPT="--comms-overlap --comms-concurrent" + + +mpirun -np $SLURM_NTASKS -x LD_LIBRARY_PATH --bind-to none ./mpiwrapper.sh \ + ./benchmarks/Benchmark_dwf_fp32 \ + $OPT \ + --mpi 2.2.2.8 \ + --accelerator-threads 8 \ + --grid 64.64.64.256 \ + --shm 2048 > dwf.16node.perf + diff --git a/systems/Tursa/dwf4.slurm b/systems/Tursa/dwf4.slurm new file mode 100644 index 00000000..65191398 --- /dev/null +++ b/systems/Tursa/dwf4.slurm @@ -0,0 +1,38 @@ +#!/bin/bash +#SBATCH -J dslash +#SBATCH -A tc002 +#SBATCH -t 2:20:00 +#SBATCH --nodelist=tu-c0r0n[00,03,06,09] +#SBATCH --exclusive +#SBATCH --nodes=4 +#SBATCH --ntasks=16 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=8 +#SBATCH --time=12:00:00 +#SBATCH --partition=gpu +#SBATCH 
--gres=gpu:4 +#SBATCH --output=%x.%j.out +#SBATCH --error=%x.%j.err + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=yes +export UCX_MEMTYPE_CACHE=n +OPT="--comms-overlap --comms-concurrent" + + +mpirun -np $SLURM_NTASKS -x LD_LIBRARY_PATH --bind-to none \ + ./mpiwrapper.sh \ + ./benchmarks/Benchmark_dwf_fp32 \ + $OPT \ + --mpi 2.2.2.2 \ + --accelerator-threads 8 \ + --grid 64.64.64.64 \ + --shm 2048 > dwf.4node.perf + + + + diff --git a/systems/Tursa/mpiwrapper.sh b/systems/Tursa/mpiwrapper.sh new file mode 100755 index 00000000..4d96ac67 --- /dev/null +++ b/systems/Tursa/mpiwrapper.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +lrank=$OMPI_COMM_WORLD_LOCAL_RANK +numa1=$(( 2 * $lrank)) +numa2=$(( 2 * $lrank + 1 )) +netdev=mlx5_${lrank}:1 + +export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK +export UCX_NET_DEVICES=mlx5_${lrank}:1 +BINDING="--interleave=$numa1,$numa2" + +echo "`hostname` - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING" + +numactl ${BINDING} $* + + + diff --git a/systems/Tursa/sourceme.sh b/systems/Tursa/sourceme.sh new file mode 100644 index 00000000..6286750d --- /dev/null +++ b/systems/Tursa/sourceme.sh @@ -0,0 +1,2 @@ +spack load c-lime +module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1 diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc index 27fe589e..e1596ea6 100644 --- a/tests/IO/Test_serialisation.cc +++ b/tests/IO/Test_serialisation.cc @@ -48,7 +48,9 @@ public: std::vector, array, std::vector >, twodimarray, std::vector> > >, cmplx3darray, - SpinColourMatrix, scm + SpinColourMatrix, scm, + std::vector > >, ragged, + std::vector >, vscm ); myclass() {} myclass(int i) @@ -56,6 +58,10 @@ public: , twodimarray(3,std::vector(5, 1.23456)) , cmplx3darray(3,std::vector>>(5, std::vector>(7, std::complex(1.2, 3.4)))) , ve(2, myenum::blue) + , ragged( {{{i+1},{i+2,i+3}}, // ragged + {{i+4,i+5,i+6,i+7},{i+8,i+9,i+10,i+11},{i+12,i+13,i+14,i+15}}, // block + {{i+16,i+17},{i+18,i+19,i+20}}} ) //ragged + , vscm(3, std::vector(5)) { e=myenum::red; x=i; @@ -68,6 +74,13 @@ public: scm()(0, 2)(1, 1) = 6.336; scm()(2, 1)(2, 2) = 7.344; scm()(1, 1)(2, 0) = 8.3534; + int Counter = i; + for( auto & v : vscm ) { + for( auto & j : v ) { + j = std::complex(Counter, -Counter); + Counter++; + } + } } }; diff --git a/tests/Test_meson_field.cc b/tests/Test_meson_field.cc new file mode 100644 index 00000000..25d908d7 --- /dev/null +++ b/tests/Test_meson_field.cc @@ -0,0 +1,148 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: tests/core/Test_meson_field.cc + +Copyright (C) 2015-2018 + +Author: Felix Erben + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ + +#include +#include + +using namespace Grid; + +const int TSRC = 0; //timeslice where rho is nonzero +const int VDIM = 5; //length of each vector + +typedef typename DomainWallFermionR::ComplexField ComplexField; +typedef typename DomainWallFermionR::FermionField FermionField; + +int main(int argc, char *argv[]) +{ + // initialization + Grid_init(&argc, &argv); + std::cout << GridLogMessage << "Grid initialized" << std::endl; + + // Lattice and rng setup + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout = GridDefaultSimd(4, vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + GridCartesian grid(latt_size,simd_layout,mpi_layout); + int Nt = GridDefaultLatt()[Tp]; + Lattice> t(&grid); + LatticeCoordinate(t, Tp); + std::vector seeds({1,2,3,4}); + GridParallelRNG pRNG(&grid); + pRNG.SeedFixedIntegers(seeds); + + // MesonField lhs and rhs vectors + std::vector phi(VDIM,&grid); + std::vector rho(VDIM,&grid); + FermionField rho_tmp(&grid); + std::cout << GridLogMessage << "Initialising random meson fields" << std::endl; + for (unsigned int i = 0; i < VDIM; ++i){ + random(pRNG,phi[i]); + random(pRNG,rho_tmp); //ideally only nonzero on t=0 + rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0 + } + std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl; + + // Gamma matrices used in the contraction + std::vector Gmu = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + // momentum phases e^{ipx} + std::vector> momenta = { + {0.,0.,0.}, + {1.,0.,0.}, + {1.,1.,0.}, + {1.,1.,1.}, + {2.,0.,0.} + }; + + std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl; + + std::cout << GridLogMessage << "Computing complex phases" << std::endl; + std::vector phases(momenta.size(),&grid); + ComplexField coor(&grid); + Complex Ci(0.0,1.0); + for (unsigned int j = 0; j < momenta.size(); ++j) + { + phases[j] = Zero(); + for(unsigned int mu = 0; mu < momenta[j].size(); mu++) + { + LatticeCoordinate(coor, mu); + phases[j] = phases[j] + momenta[j][mu]/GridDefaultLatt()[mu]*coor; + } + phases[j] = exp((Real)(2*M_PI)*Ci*phases[j]); + } + std::cout << GridLogMessage << "Computing complex phases done." << std::endl; + + Eigen::Tensor Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); + Eigen::Tensor Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); + Eigen::Tensor Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); + + // timer + double start,stop; + + //execute meson field routine + start = usecond(); + A2Autils::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); + stop = usecond(); + std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl; + start = usecond(); + /* Ideally, for this meson field we could pass TSRC (even better a list of timeslices) + * to the routine so that all the compnents which are predictably equal to zero are not computed. 
*/ + A2Autils::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp); + stop = usecond(); + std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl; + start = usecond(); + A2Autils::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp); + stop = usecond(); + std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl; + + std::string FileName = "Meson_Fields"; +#ifdef HAVE_HDF5 + using Default_Reader = Grid::Hdf5Reader; + using Default_Writer = Grid::Hdf5Writer; + FileName.append(".h5"); +#else + using Default_Reader = Grid::BinaryReader; + using Default_Writer = Grid::BinaryWriter; + FileName.append(".bin"); +#endif + + Default_Writer w(FileName); + write(w,"phi_phi",Mpp); + write(w,"phi_rho",Mpr); + write(w,"rho_rho",Mrr); + + // epilogue + std::cout << GridLogMessage << "Grid is finalizing now" << std::endl; + Grid_finalize(); + + return EXIT_SUCCESS; +}
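A hedged readback sketch for the meson fields written by this test (the reader type mirrors
the writer selection above; the tensor element type, rank and row-major layout are assumptions
here and must match whatever the test actually instantiated for ``Mpp``)::

    #include <Grid/Grid.h>

    using namespace Grid;

    int main(int argc, char *argv[])
    {
      Grid_init(&argc, &argv);
    #ifdef HAVE_HDF5
      Hdf5Reader   reader("Meson_Fields.h5");
    #else
      BinaryReader reader("Meson_Fields.bin");
    #endif
      // assumed: rank-5 row-major complex tensor with (momentum, gamma, t, i, j) ordering
      Eigen::Tensor<ComplexD, 5, Eigen::RowMajor> Mpp(1, 1, 1, 1, 1);
      read(reader, "phi_phi", Mpp);   // dimensions are resized to those stored in the file
      std::cout << GridLogMessage << "phi_phi read back, Nt = " << Mpp.dimension(2)
                << ", elements = " << Mpp.size() << std::endl;
      Grid_finalize();
      return EXIT_SUCCESS;
    }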