diff --git a/benchmarks/Benchmark_memory_bandwidth.cc b/benchmarks/Benchmark_memory_bandwidth.cc index 848f271d..cc965050 100644 --- a/benchmarks/Benchmark_memory_bandwidth.cc +++ b/benchmarks/Benchmark_memory_bandwidth.cc @@ -55,7 +55,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; @@ -85,7 +85,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; @@ -116,7 +116,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; @@ -147,7 +147,7 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; @@ -171,8 +171,6 @@ int main (int argc, char ** argv) std::cout< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + LatticeColourMatrix z(&Grid); random(pRNG,z); + LatticeColourMatrix x(&Grid); random(pRNG,x); + LatticeColourMatrix y(&Grid); random(pRNG,y); + LatticeColourMatrix tmp(&Grid); + + for(int mu=0;mu<4;mu++){ + double tshift=0; + double tmult =0; + double start=usecond(); for(int64_t i=0;i &rhs,commVector &buffer,int dimen int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int e1=rhs._grid->_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; + int ent = 0; + + static std::vector > table; table.resize(e1*e2); int stride=rhs._grid->_slice_stride[dimension]; if ( cbmask == 0x3 ) { - parallel_for_nest2(int n=0;n(off+bo+b,so+o+b); } } } else { int bo=0; - std::vector > table; for(int n=0;nCheckerBoardFromOindex(o+b); if ( ocb &cbmask ) { - table.push_back(std::pair (bo++,o+b)); + table[ent++]=std::pair (off+bo++,so+o+b); } } } - parallel_for(int i=0;i void Scatter_plane_simple (Lattice &rhs,commVector_slice_nblock[dimension]; int e2=rhs._grid->_slice_block[dimension]; int stride=rhs._grid->_slice_stride[dimension]; - + + static std::vector > table; table.resize(e1*e2); + int ent =0; + if ( cbmask ==0x3 ) { - parallel_for_nest2(int n=0;n_slice_stride[dimension]; int bo =n*rhs._grid->_slice_block[dimension]; - rhs._odata[so+o+b]=buffer[bo+b]; + table[ent++] = std::pair(so+o+b,bo); } } + } else { - std::vector > table; int bo=0; for(int n=0;n_slice_stride[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb & cbmask ) { - table.push_back(std::pair (so+o+b,bo++)); + table[ent++]=std::pair (so+o+b,bo++); } } } - parallel_for(int i=0;i void Copy_plane(Lattice& lhs,const Lattice &rhs int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc int e2=rhs._grid->_slice_block[dimension]; int stride = rhs._grid->_slice_stride[dimension]; + static std::vector > table; table.resize(e1*e2); + int ent=0; + if(cbmask == 0x3 ){ - parallel_for_nest2(int n=0;n(lo+o,ro+o); } } } else { - parallel_for_nest2(int n=0;nCheckerBoardFromOindex(o); if ( ocb&cbmask ) { - //lhs._odata[lo+o]=rhs._odata[ro+o]; - vstream(lhs._odata[lo+o],rhs._odata[ro+o]); + table[ent++] = std::pair(lo+o,ro+o); } } } } - + + parallel_for(int i=0;i void Copy_plane_permute(Lattice& lhs,const Lattice &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) @@ -269,16 +278,28 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice_slice_block [dimension]; int stride = rhs._grid->_slice_stride[dimension]; - parallel_for_nest2(int n=0;n > table; table.resize(e1*e2); + int ent=0; + double t_tab,t_perm; + if ( cbmask == 0x3 ) { + for(int n=0;n(lo+o+b,ro+o+b); + }} + } else { + for(int n=0;nCheckerBoardFromOindex(o+b); - if ( ocb&cbmask ) { - permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type); - } + if ( ocb&cbmask ) table[ent++] = std::pair(lo+o+b,ro+o+b); + }} + } - }} + parallel_for(int i=0;i void Cshift_local(Lattice& ret,const Lattice &r sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); + double t_local; + if ( sshift[0] == sshift[1] ) { Cshift_local(ret,rhs,dimension,shift,0x3); } else { @@ -299,7 +322,7 @@ template void Cshift_local(Lattice& ret,const Lattice &r } } -template Lattice Cshift_local(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +template void Cshift_local(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { GridBase *grid = rhs._grid; int fd = grid->_fdimensions[dimension]; @@ -325,11 +348,7 @@ template Lattice Cshift_local(Lattice &ret,const Lattice int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sx = (x+sshift)%rd; - - // FIXME : This must change where we have a - // Rotate slice. - // Document how this works ; why didn't I do this when I first wrote it... // wrap is whether sshift > rd. // num is sshift mod rd. // @@ -365,10 +384,8 @@ template Lattice Cshift_local(Lattice &ret,const Lattice if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist); else Copy_plane(ret,rhs,dimension,x,sx,cbmask); - } - return ret; } } #endif diff --git a/lib/lattice/Lattice_base.h b/lib/lattice/Lattice_base.h index dcd55702..1169d18f 100644 --- a/lib/lattice/Lattice_base.h +++ b/lib/lattice/Lattice_base.h @@ -256,13 +256,42 @@ public: _odata[ss]=r._odata[ss]; } } - + Lattice(Lattice&& r){ // move constructor _grid = r._grid; checkerboard = r.checkerboard; _odata=std::move(r._odata); } + inline Lattice & operator = (Lattice && r) + { + _grid = r._grid; + checkerboard = r.checkerboard; + _odata =std::move(r._odata); + return *this; + } + + inline Lattice & operator = (const Lattice & r){ + _grid = r._grid; + checkerboard = r.checkerboard; + _odata.resize(_grid->oSites());// essential + + parallel_for(int ss=0;ss<_grid->oSites();ss++){ + _odata[ss]=r._odata[ss]; + } + return *this; + } + + template strong_inline Lattice & operator = (const Lattice & r){ + this->checkerboard = r.checkerboard; + conformable(*this,r); + + parallel_for(int ss=0;ss<_grid->oSites();ss++){ + this->_odata[ss]=r._odata[ss]; + } + return *this; + } + virtual ~Lattice(void) = default; void reset(GridBase* grid) { @@ -281,33 +310,6 @@ public: return *this; } - template strong_inline Lattice & operator = (const Lattice & r){ - this->checkerboard = r.checkerboard; - conformable(*this,r); - - parallel_for(int ss=0;ss<_grid->oSites();ss++){ - this->_odata[ss]=r._odata[ss]; - } - return *this; - } - - strong_inline Lattice & operator = (const Lattice & r){ - _grid = r._grid; - checkerboard = r.checkerboard; - _odata.resize(_grid->oSites());// essential - - parallel_for(int ss=0;ss<_grid->oSites();ss++){ - _odata[ss]=r._odata[ss]; - } - return *this; - } - strong_inline Lattice & operator = (Lattice && r) - { - _grid = r._grid; - checkerboard = r.checkerboard; - _odata =std::move(r._odata); - return *this; - } // *=,+=,-= operators inherit behvour from correspond */+/- operation template strong_inline Lattice &operator *=(const T &r) { diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 45fd522e..ce84fc81 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -110,11 +110,11 @@ class BinaryIO { lsites = 1; } - #pragma omp parallel +PARALLEL_REGION { uint32_t nersc_csum_thr = 0; - #pragma omp for +PARALLEL_FOR_LOOP_INTERN for (uint64_t local_site = 0; local_site < lsites; local_site++) { uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; @@ -124,7 +124,7 @@ class BinaryIO { } } - #pragma omp critical +PARALLEL_CRITICAL { nersc_csum += nersc_csum_thr; } @@ -146,14 +146,14 @@ class BinaryIO { std::vector local_start =grid->LocalStarts(); std::vector global_vol =grid->FullDimensions(); -#pragma omp parallel +PARALLEL_REGION { std::vector coor(nd); uint32_t scidac_csuma_thr=0; uint32_t scidac_csumb_thr=0; uint32_t site_crc=0; -#pragma omp for +PARALLEL_FOR_LOOP_INTERN for(uint64_t local_site=0;local_site>(32-gsite31); } -#pragma omp critical +PARALLEL_CRITICAL { scidac_csuma^= scidac_csuma_thr; scidac_csumb^= scidac_csumb_thr; diff --git a/lib/qcd/hmc/integrators/Integrator.h b/lib/qcd/hmc/integrators/Integrator.h index 95b19904..232eba42 100644 --- a/lib/qcd/hmc/integrators/Integrator.h +++ b/lib/qcd/hmc/integrators/Integrator.h @@ -114,18 +114,26 @@ class Integrator { // input U actually not used in the fundamental case // Fundamental updates, include smearing - for (int a = 0; a < as[level].actions.size(); ++a) { + for (int a = 0; a < as[level].actions.size(); ++a) { + double start_full = usecond(); Field force(U._grid); conformable(U._grid, Mom._grid); + Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); + double start_force = usecond(); as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); force = FieldImplementation::projectForce(force); // Ta for gauge fields + double end_force = usecond(); Real force_abs = std::sqrt(norm2(force)/U._grid->gSites()); - std::cout << GridLogIntegrator << "Force average: " << force_abs << std::endl; + std::cout << GridLogIntegrator << "["< -class NoSmearing { +//trivial class for no smearing +template +class NoSmearing +{ public: INHERIT_FIELD_TYPES(Impl); - Field* ThinField; + Field *ThinField; - NoSmearing(): ThinField(NULL) {} + NoSmearing() : ThinField(NULL) {} - void set_Field(Field& U) { ThinField = &U; } + void set_Field(Field &U) { ThinField = &U; } - void smeared_force(Field&) const {} + void smeared_force(Field &) const {} - Field& get_SmearedU() { return *ThinField; } + Field &get_SmearedU() { return *ThinField; } - Field& get_U(bool smeared = false) { + Field &get_U(bool smeared = false) + { return *ThinField; } - }; /*! @@ -44,32 +47,36 @@ public: It stores a list of smeared configurations. */ template -class SmearedConfiguration { - public: +class SmearedConfiguration +{ +public: INHERIT_GIMPL_TYPES(Gimpl); - private: +private: const unsigned int smearingLevels; Smear_Stout StoutSmearing; std::vector SmearedSet; // Member functions //==================================================================== - void fill_smearedSet(GaugeField& U) { - ThinLinks = &U; // attach the smearing routine to the field U + void fill_smearedSet(GaugeField &U) + { + ThinLinks = &U; // attach the smearing routine to the field U // check the pointer is not null if (ThinLinks == NULL) std::cout << GridLogError << "[SmearedConfiguration] Error in ThinLinks pointer\n"; - if (smearingLevels > 0) { + if (smearingLevels > 0) + { std::cout << GridLogDebug << "[SmearedConfiguration] Filling SmearedSet\n"; GaugeField previous_u(ThinLinks->_grid); previous_u = *ThinLinks; - for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) { + for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) + { StoutSmearing.smear(SmearedSet[smearLvl], previous_u); previous_u = SmearedSet[smearLvl]; @@ -81,9 +88,10 @@ class SmearedConfiguration { } } //==================================================================== - GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime, - const GaugeField& GaugeK) const { - GridBase* grid = GaugeK._grid; + GaugeField AnalyticSmearedForce(const GaugeField &SigmaKPrime, + const GaugeField &GaugeK) const + { + GridBase *grid = GaugeK._grid; GaugeField C(grid), SigmaK(grid), iLambda(grid); GaugeLinkField iLambda_mu(grid); GaugeLinkField iQ(grid), e_iQ(grid); @@ -94,7 +102,8 @@ class SmearedConfiguration { SigmaK = zero; iLambda = zero; - for (int mu = 0; mu < Nd; mu++) { + for (int mu = 0; mu < Nd; mu++) + { Cmu = peekLorentz(C, mu); GaugeKmu = peekLorentz(GaugeK, mu); SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu); @@ -104,20 +113,22 @@ class SmearedConfiguration { pokeLorentz(iLambda, iLambda_mu, mu); } StoutSmearing.derivative(SigmaK, iLambda, - GaugeK); // derivative of SmearBase + GaugeK); // derivative of SmearBase return SigmaK; } /*! @brief Returns smeared configuration at level 'Level' */ - const GaugeField& get_smeared_conf(int Level) const { + const GaugeField &get_smeared_conf(int Level) const + { return SmearedSet[Level]; } //==================================================================== - void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ, - const GaugeLinkField& iQ, const GaugeLinkField& Sigmap, - const GaugeLinkField& GaugeK) const { - GridBase* grid = iQ._grid; + void set_iLambda(GaugeLinkField &iLambda, GaugeLinkField &e_iQ, + const GaugeLinkField &iQ, const GaugeLinkField &Sigmap, + const GaugeLinkField &GaugeK) const + { + GridBase *grid = iQ._grid; GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid); GaugeLinkField unity(grid); unity = 1.0; @@ -206,15 +217,15 @@ class SmearedConfiguration { } //==================================================================== - public: - GaugeField* - ThinLinks; /*!< @brief Pointer to the thin - links configuration */ +public: + GaugeField * + ThinLinks; /* Pointer to the thin links configuration */ - /*! @brief Standard constructor */ - SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear, - Smear_Stout& Stout) - : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) { + /* Standard constructor */ + SmearedConfiguration(GridCartesian *UGrid, unsigned int Nsmear, + Smear_Stout &Stout) + : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) + { for (unsigned int i = 0; i < smearingLevels; ++i) SmearedSet.push_back(*(new GaugeField(UGrid))); } @@ -223,21 +234,29 @@ class SmearedConfiguration { SmearedConfiguration() : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} - - // attach the smeared routines to the thin links U and fill the smeared set - void set_Field(GaugeField& U) { fill_smearedSet(U); } + void set_Field(GaugeField &U) + { + double start = usecond(); + fill_smearedSet(U); + double end = usecond(); + double time = (end - start)/ 1e3; + std::cout << GridLogMessage << "Smearing in " << time << " ms" << std::endl; + } //==================================================================== - void smeared_force(GaugeField& SigmaTilde) const { - if (smearingLevels > 0) { + void smeared_force(GaugeField &SigmaTilde) const + { + if (smearingLevels > 0) + { + double start = usecond(); GaugeField force = SigmaTilde; // actually = U*SigmaTilde GaugeLinkField tmp_mu(SigmaTilde._grid); - for (int mu = 0; mu < Nd; mu++) { + for (int mu = 0; mu < Nd; mu++) + { // to get just SigmaTilde - tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * - peekLorentz(force, mu); + tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * peekLorentz(force, mu); pokeLorentz(force, tmp_mu, mu); } @@ -246,33 +265,43 @@ class SmearedConfiguration { force = AnalyticSmearedForce(force, *ThinLinks); - for (int mu = 0; mu < Nd; mu++) { + for (int mu = 0; mu < Nd; mu++) + { tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu); pokeLorentz(SigmaTilde, tmp_mu, mu); } - } // if smearingLevels = 0 do nothing + double end = usecond(); + double time = (end - start)/ 1e3; + std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl; + } // if smearingLevels = 0 do nothing } //==================================================================== - GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; } + GaugeField &get_SmearedU() { return SmearedSet[smearingLevels - 1]; } - GaugeField& get_U(bool smeared = false) { + GaugeField &get_U(bool smeared = false) + { // get the config, thin links by default - if (smeared) { - if (smearingLevels) { + if (smeared) + { + if (smearingLevels) + { RealD impl_plaq = WilsonLoops::avgPlaquette(SmearedSet[smearingLevels - 1]); std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq << std::endl; return get_SmearedU(); - - } else { + } + else + { RealD impl_plaq = WilsonLoops::avgPlaquette(*ThinLinks); std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq << std::endl; return *ThinLinks; } - } else { + } + else + { RealD impl_plaq = WilsonLoops::avgPlaquette(*ThinLinks); std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq << std::endl; diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index a53d8921..92567b6f 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -49,6 +49,8 @@ int main (int argc, char ** argv) const int Ls=8; + std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -90,24 +92,23 @@ int main (int argc, char ** argv) SchurDiagMooeeOperator HermOpEO(Ddwf); SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); - std::cout << "Starting mixed CG" << std::endl; + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; MixedPrecisionConjugateGradient mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); mCG(src_o,result_o); - std::cout << "Starting regular CG" << std::endl; + std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl; ConjugateGradient CG(1.0e-8,10000); CG(HermOpEO,src_o,result_o_2); LatticeFermionD diff_o(FrbGrid); RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2); - std::cout << "Diff between mixed and regular CG: " << diff << std::endl; + std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl; #ifdef HAVE_LIME if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){ std::string file1("./Propagator1"); - std::string file2("./Propagator2"); emptyUserRecord record; uint32_t nersc_csum; uint32_t scidac_csuma; @@ -121,12 +122,12 @@ int main (int argc, char ** argv) BinaryIO::writeLatticeObject(result_o,file1,munge, 0, format, nersc_csum,scidac_csuma,scidac_csumb); - std::cout << " Mixed checksums "<(result_o_2,file1,munge, 0, format, nersc_csum,scidac_csuma,scidac_csumb); - std::cout << " CG checksums "<