From 037bb6ea7397fcedddcd94515786849351e4d507 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 16 Mar 2020 14:07:52 +0100 Subject: [PATCH 01/51] Check in reader for openqcd configs This reader is suboptimal in the sense that it opens the entire config on every MPI rank. --- Grid/parallelIO/OpenQcdIO.h | 153 +++++++++++++++++++++++++++++++++++ Grid/qcd/hmc/HMC_aggregate.h | 1 + tests/IO/Test_openqcd_io.cc | 55 +++++++++++++ 3 files changed, 209 insertions(+) create mode 100644 Grid/parallelIO/OpenQcdIO.h create mode 100644 tests/IO/Test_openqcd_io.cc diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h new file mode 100644 index 00000000..f340e8fc --- /dev/null +++ b/Grid/parallelIO/OpenQcdIO.h @@ -0,0 +1,153 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/parallelIO/OpenQcdIO.h + +Copyright (C) 2015 - 2020 + +Author: Daniel Richtmann + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +NAMESPACE_BEGIN(Grid); + +struct OpenQcdHeader : Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader, + int, Nt, + int, Nx, + int, Ny, + int, Nz, + double, plaq); +}; + +class OpenQcdIO : public BinaryIO { +public: + static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6 + + static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) { + OpenQcdHeader header; + + { + std::ifstream fin(file, std::ios::in | std::ios::binary); + fin.read(reinterpret_cast(&header), sizeof(OpenQcdHeader)); + assert(!fin.fail()); + field.data_start = fin.tellg(); + fin.close(); + } + + header.plaq /= normalisationFactor; + + // sanity check (should trigger on endian issues) + assert(0 < header.Nt && header.Nt <= 1024); + assert(0 < header.Nx && header.Nx <= 1024); + assert(0 < header.Ny && header.Ny <= 1024); + assert(0 < header.Nz && header.Nz <= 1024); + + field.dimension[0] = header.Nx; + field.dimension[1] = header.Ny; + field.dimension[2] = header.Nz; + field.dimension[3] = header.Nt; + + assert(grid->_ndimension == Nd); + for(int d = 0; d < Nd; d++) + assert(grid->_fdimensions[d] == field.dimension[d]); + + field.plaquette = header.plaq; + + return field.data_start; + } + + template + static inline void readConfiguration(Lattice>& Umu, + FieldMetaData& header, + std::string file) { + auto grid = dynamic_cast(Umu.Grid()); + assert(grid != nullptr); + assert((grid->_ndimension == Nd) && (Nd == 4)); + + uint64_t offset = readHeader(file, Umu.Grid(), header); + FieldMetaData clone(header); + + // NOTE: This version is suboptimal because we read in the full file on every rank + std::vector data(grid->gSites() * 4); + { + auto fin = std::fstream(file, std::ios::in | std::ios::binary); + fin.seekg(offset); 
+ fin.read((char *)data.data(), data.size() * sizeof(ColourMatrix)); + fin.close(); + } + + // global lattice size + Coordinate fdim = grid->FullDimensions(); + + // coordinate of this process + Coordinate pcoor; + grid->ProcessorCoorFromRank(CartesianCommunicator::RankWorld(), pcoor); + + // loop over local indices + thread_for(idx, grid->lSites(), { + // convert local index to global coordinate + Coordinate lcoor, gcoor; + grid->LocalIndexToLocalCoor(idx, lcoor); + grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor, lcoor, gcoor); + + // openQCD stores links attached to odd sites + bool neg = (gcoor[Xdir] + gcoor[Ydir] + gcoor[Zdir] + gcoor[Tdir]) % 2 != 1; + + LorentzColourMatrix site_data; + for (int mu = 0; mu < 4; ++mu) { + // determine the site at which it is stored + Coordinate c = gcoor; + if (neg) + c[mu] = (c[mu] + 1) % grid->FullDimensions()[mu]; + + // site-index in the OpenQCD format (which uses t,x,y,z order) + int openqcd_idx = (c[Tdir] * fdim[Xdir] * fdim[Ydir] * fdim[Zdir] + + c[Xdir] * fdim[Ydir] * fdim[Zdir] + + c[Ydir] * fdim[Zdir] + + c[Zdir])/2; + int openqcd_mu = (mu + 1) % 4; + + // pick the colour-matrix out + site_data(mu) = + data[8 * openqcd_idx + 2 * openqcd_mu + (neg ? 
1 : 0)](); + } + + pokeLocalSite(site_data, Umu, lcoor); + }); + + GaugeStatistics(Umu, clone); + + std::cout << GridLogMessage << "OpenQcd Configuration " << file << " plaquette " + << std::setprecision(15) + << clone.plaquette << " header " << header.plaquette + << " difference " << fabs(clone.plaquette - header.plaquette) + << std::endl; + + if(fabs(clone.plaquette - header.plaquette) >= 1.0e-5) std::cout << " Plaquette mismatch " << std::endl; + assert(fabs(clone.plaquette - header.plaquette) < 1.0e-5); + + std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl; + } +}; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h index e4d2ce83..94c745e1 100644 --- a/Grid/qcd/hmc/HMC_aggregate.h +++ b/Grid/qcd/hmc/HMC_aggregate.h @@ -39,6 +39,7 @@ directory #include #include #include +#include NAMESPACE_CHECK(Ildg); #include diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc new file mode 100644 index 00000000..2a5769bd --- /dev/null +++ b/tests/IO/Test_openqcd_io.cc @@ -0,0 +1,55 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/io/Test_openqcd_io.cc + +Copyright (C) 2015 - 2020 + +Author: Daniel Richtmann + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace Grid; + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + GridCartesian* grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), + GridDefaultSimd(Nd, vComplexD::Nsimd()), + GridDefaultMpi()); + + LatticeGaugeField Umu(grid); + + FieldMetaData header; + + if(!Grid::GridCmdOptionExists(argv, argv + argc, "--config")) { + std::cout << GridLogError << "You need to use --config /path/to/openqcd_config" << std::endl; + abort(); + } + + std::string file = Grid::GridCmdOptionPayload(argv, argv + argc, "--config"); + assert(!file.empty()); + + OpenQcdIO::readConfiguration(Umu, header, file); + + Grid_finalize(); +} From c9b737a4e7bb62cf6d22dace3458b108043e5023 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 16 Mar 2020 17:58:30 -0400 Subject: [PATCH 02/51] make trace,adj,transpose unary operators --- Grid/lattice/Lattice.h | 4 ++-- Grid/lattice/Lattice_trace.h | 2 ++ .../WilsonCloverFermionImplementation.h | 8 ++++---- Grid/qcd/action/gauge/GaugeImplementations.h | 2 +- Grid/qcd/utils/WilsonLoops.h | 4 ++-- bootstrap.sh | 15 ++++++++------- 6 files changed, 19 insertions(+), 16 deletions(-) diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 1eea98ed..6eac0362 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -31,11 +31,11 @@ Author: Peter Boyle #include #include #include -#include +//#include #include #include #include -#include +//#include #include #include #include diff --git a/Grid/lattice/Lattice_trace.h b/Grid/lattice/Lattice_trace.h index 93444e0c..6b5f67d2 100644 --- 
a/Grid/lattice/Lattice_trace.h +++ b/Grid/lattice/Lattice_trace.h @@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid); //////////////////////////////////////////////////////////////////////////////////////////////////// // Trace //////////////////////////////////////////////////////////////////////////////////////////////////// +/* template inline auto trace(const Lattice &lhs) -> Lattice { @@ -48,6 +49,7 @@ inline auto trace(const Lattice &lhs) -> Lattice }); return ret; }; +*/ //////////////////////////////////////////////////////////////////////////////////////////////////// // Trace Index level dependent operation diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index 5744d3bb..9d99d9e7 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -132,14 +132,14 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm); - pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm)); - pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm)); + pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm))); + pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm))); pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); - pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); - pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); + pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv))); + pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv))); } template diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h index a14aec1b..19bc5aa6 100644 --- 
a/Grid/qcd/action/gauge/GaugeImplementations.h +++ b/Grid/qcd/action/gauge/GaugeImplementations.h @@ -59,7 +59,7 @@ public: } static inline GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { - return Cshift(adj(Link), mu, -1); + return Cshift(closure(adj(Link)), mu, -1); } static inline GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 0367c9fa..fdd53698 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -485,7 +485,7 @@ public: // Up staple ___ ___ // | | - tmp = Cshift(adj(U[nu]), nu, -1); + tmp = Cshift(closure(adj(U[nu])), nu, -1); tmp = adj(U2[mu]) * tmp; tmp = Cshift(tmp, mu, -2); @@ -519,7 +519,7 @@ public: // // | | - tmp = Cshift(adj(U2[nu]), nu, -2); + tmp = Cshift(closure(adj(U2[nu])), nu, -2); tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp); tmp = U2[nu] * Cshift(tmp, nu, 2); Stap += Cshift(tmp, mu, 1); diff --git a/bootstrap.sh b/bootstrap.sh index 4bd3de5e..5ea7ce32 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -1,19 +1,20 @@ #!/usr/bin/env bash set -e -EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2' +#https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2 +EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.7.tar.bz2' EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11' echo "-- deploying Eigen source..." 
ARC=`basename ${EIGEN_URL}` wget ${EIGEN_URL} --no-check-certificate -if command -v sha256sum; then - echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \ - | sha256sum --check || exit 1 -else - echo "WARNING: could not verify checksum, please install sha256sum" >&2 -fi +#if command -v sha256sum; then +# echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \ +# | sha256sum --check || exit 1 +#else +# echo "WARNING: could not verify checksum, please install sha256sum" >&2 +#fi ./scripts/update_eigen.sh ${ARC} rm ${ARC} # patch for non-portable includes in Eigen 3.3.5 From 60db3133d38e8e4433627e6351e38cafe210c175 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 16 Mar 2020 17:59:56 -0400 Subject: [PATCH 03/51] make trace,adj,transpose unary operators --- bootstrap.sh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bootstrap.sh b/bootstrap.sh index 5ea7ce32..4bd3de5e 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -1,20 +1,19 @@ #!/usr/bin/env bash set -e -#https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2 -EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.3.7.tar.bz2' +EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2' EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11' echo "-- deploying Eigen source..." 
ARC=`basename ${EIGEN_URL}` wget ${EIGEN_URL} --no-check-certificate -#if command -v sha256sum; then -# echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \ -# | sha256sum --check || exit 1 -#else -# echo "WARNING: could not verify checksum, please install sha256sum" >&2 -#fi +if command -v sha256sum; then + echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \ + | sha256sum --check || exit 1 +else + echo "WARNING: could not verify checksum, please install sha256sum" >&2 +fi ./scripts/update_eigen.sh ${ARC} rm ${ARC} # patch for non-portable includes in Eigen 3.3.5 From 989af658071f5d9fc92adc0d6e0ab9775b3e0e51 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 23 Mar 2020 17:33:18 +0100 Subject: [PATCH 04/51] Check in parallel reader for openqcd configs --- Grid/parallelIO/MetaData.h | 24 ++ Grid/parallelIO/OpenQcdIO.h | 165 ++++++++---- Grid/parallelIO/OpenQcdIOChromaReference.h | 281 +++++++++++++++++++++ Grid/qcd/hmc/HMC_aggregate.h | 3 + tests/IO/Test_openqcd_io.cc | 51 +++- 5 files changed, 466 insertions(+), 58 deletions(-) create mode 100644 Grid/parallelIO/OpenQcdIOChromaReference.h diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h index 2e211838..4c1cfbdb 100644 --- a/Grid/parallelIO/MetaData.h +++ b/Grid/parallelIO/MetaData.h @@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger { }; }; +template +struct GaugeDoubleStoredMunger{ + void operator()(fobj &in, sobj &out) { + for (int mu = 0; mu < Nds; mu++) { + for (int i = 0; i < Nc; i++) { + for (int j = 0; j < Nc; j++) { + out(mu)()(i, j) = in(mu)()(i, j); + }} + } + }; +}; + +template +struct GaugeDoubleStoredUnmunger { + void operator()(sobj &in, fobj &out) { + for (int mu = 0; mu < Nds; mu++) { + for (int i = 0; i < Nc; i++) { + for (int j = 0; j < Nc; j++) { + out(mu)()(i, j) = in(mu)()(i, j); + }} + } + }; +}; + template struct Gauge3x2munger{ void operator() (fobj &in,sobj &out){ diff --git a/Grid/parallelIO/OpenQcdIO.h b/Grid/parallelIO/OpenQcdIO.h index f340e8fc..00911595 100644 --- 
a/Grid/parallelIO/OpenQcdIO.h +++ b/Grid/parallelIO/OpenQcdIO.h @@ -67,6 +67,10 @@ public: field.dimension[2] = header.Nz; field.dimension[3] = header.Nt; + std::cout << GridLogDebug << "header: " << header << std::endl; + std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl; + std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl; + assert(grid->_ndimension == Nd); for(int d = 0; d < Nd; d++) assert(grid->_fdimensions[d] == field.dimension[d]); @@ -80,74 +84,141 @@ public: static inline void readConfiguration(Lattice>& Umu, FieldMetaData& header, std::string file) { + typedef Lattice> DoubleStoredGaugeField; + + assert(Ns == 4 and Nd == 4 and Nc == 3); + auto grid = dynamic_cast(Umu.Grid()); - assert(grid != nullptr); - assert((grid->_ndimension == Nd) && (Nd == 4)); + assert(grid != nullptr); assert(grid->_ndimension == Nd); uint64_t offset = readHeader(file, Umu.Grid(), header); + FieldMetaData clone(header); - // NOTE: This version is suboptimal because we read in the full file on every rank - std::vector data(grid->gSites() * 4); - { - auto fin = std::fstream(file, std::ios::in | std::ios::binary); - fin.seekg(offset); - fin.read((char *)data.data(), data.size() * sizeof(ColourMatrix)); - fin.close(); - } + std::string format("IEEE64"); // they always store little endian double precsision + uint32_t nersc_csum, scidac_csuma, scidac_csumb; - // global lattice size - Coordinate fdim = grid->FullDimensions(); + GridCartesian* grid_openqcd = createOpenQcdGrid(grid); + GridRedBlackCartesian* grid_rb = SpaceTimeGrid::makeFourDimRedBlackGrid(grid); - // coordinate of this process - Coordinate pcoor; - grid->ProcessorCoorFromRank(CartesianCommunicator::RankWorld(), pcoor); + typedef DoubleStoredColourMatrixD fobj; + typedef typename DoubleStoredGaugeField::vector_object::scalar_object sobj; + typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word; - // loop over local indices - 
thread_for(idx, grid->lSites(), { - // convert local index to global coordinate - Coordinate lcoor, gcoor; - grid->LocalIndexToLocalCoor(idx, lcoor); - grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor, lcoor, gcoor); + word w = 0; - // openQCD stores links attached to odd sites - bool neg = (gcoor[Xdir] + gcoor[Ydir] + gcoor[Zdir] + gcoor[Tdir]) % 2 != 1; + std::vector iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here + std::vector scalardata(grid->lSites()); - LorentzColourMatrix site_data; - for (int mu = 0; mu < 4; ++mu) { - // determine the site at which it is stored - Coordinate c = gcoor; - if (neg) - c[mu] = (c[mu] + 1) % grid->FullDimensions()[mu]; + IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC, + nersc_csum, scidac_csuma, scidac_csumb); - // site-index in the OpenQCD format (which uses t,x,y,z order) - int openqcd_idx = (c[Tdir] * fdim[Xdir] * fdim[Ydir] * fdim[Zdir] - + c[Xdir] * fdim[Ydir] * fdim[Zdir] - + c[Ydir] * fdim[Zdir] - + c[Zdir])/2; - int openqcd_mu = (mu + 1) % 4; + GridStopWatch timer; + timer.Start(); - // pick the colour-matrix out - site_data(mu) = - data[8 * openqcd_idx + 2 * openqcd_mu + (neg ? 
1 : 0)](); - } + DoubleStoredGaugeField Umu_ds(grid); - pokeLocalSite(site_data, Umu, lcoor); + auto munge = GaugeDoubleStoredMunger(); + + Coordinate ldim = grid->LocalDimensions(); + thread_for(idx_g, grid->lSites(), { + Coordinate coor; + grid->LocalIndexToLocalCoor(idx_g, coor); + + bool isOdd = grid_rb->CheckerBoard(coor) == Odd; + + if(!isOdd) continue; + + int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir] + + coor[Xdir] * ldim[Ydir] * ldim[Zdir] + + coor[Ydir] * ldim[Zdir] + + coor[Zdir])/2; + + munge(iodata[idx_o], scalardata[idx_g]); }); + grid->Barrier(); timer.Stop(); + std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl; + + timer.Reset(); timer.Start(); + + vectorizeFromLexOrdArray(scalardata, Umu_ds); + + grid->Barrier(); timer.Stop(); + std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl; + + timer.Reset(); timer.Start(); + + undoDoubleStore(Umu, Umu_ds); + + grid->Barrier(); timer.Stop(); + std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl; + GaugeStatistics(Umu, clone); - std::cout << GridLogMessage << "OpenQcd Configuration " << file << " plaquette " - << std::setprecision(15) - << clone.plaquette << " header " << header.plaquette - << " difference " << fabs(clone.plaquette - header.plaquette) - << std::endl; + RealD plaq_diff = fabs(clone.plaquette - header.plaquette); - if(fabs(clone.plaquette - header.plaquette) >= 1.0e-5) std::cout << " Plaquette mismatch " << std::endl; - assert(fabs(clone.plaquette - header.plaquette) < 1.0e-5); + // clang-format off + std::cout << GridLogMessage << "OpenQcd Configuration " << file + << " plaquette " << clone.plaquette + << " header " << header.plaquette + << " difference " << plaq_diff + << std::endl; + // clang-format on + + RealD precTol = (getPrecision::value == 1) ? 
2e-7 : 2e-15; + RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code + + if(plaq_diff >= tol) + std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl; + assert(plaq_diff < tol); std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl; } + + template + static inline void writeConfiguration(Lattice>& Umu, + std::string file) { + std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl; + exit(EXIT_FAILURE); + } + +private: + static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) { + // exploit GridCartesian to be able to still use IOobject + Coordinate gdim = grid->GlobalDimensions(); + Coordinate ldim = grid->LocalDimensions(); + Coordinate pcoor = grid->ThisProcessorCoor(); + + // openqcd does rb on the z direction + gdim[Zdir] /= 2; + ldim[Zdir] /= 2; + + // and has the order T X Y Z (from slowest to fastest) + std::swap(gdim[Xdir], gdim[Zdir]); + std::swap(ldim[Xdir], ldim[Zdir]); + std::swap(pcoor[Xdir], pcoor[Zdir]); + + GridCartesian* ret = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid()); + ret->_ldimensions = ldim; + ret->_processor_coor = pcoor; + return ret; + } + + template + static inline void undoDoubleStore(Lattice>& Umu, + Lattice> const& Umu_ds) { + conformable(Umu.Grid(), Umu_ds.Grid()); + Lattice> U(Umu.Grid()); + + // they store T+, T-, X+, X-, Y+, Y-, Z+, Z- + for(int mu_g = 0; mu_g < Nd; ++mu_g) { + int mu_o = (mu_g + 1) % Nd; + U = PeekIndex(Umu_ds, 2 * mu_o) + + Cshift(PeekIndex(Umu_ds, 2 * mu_o + 1), mu_g, +1); + PokeIndex(Umu, U, mu_g); + } + } }; NAMESPACE_END(Grid); diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h new file mode 100644 index 00000000..bab54fe8 --- /dev/null +++ b/Grid/parallelIO/OpenQcdIOChromaReference.h @@ -0,0 +1,281 @@ 
+/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h + +Copyright (C) 2015 - 2020 + +Author: Daniel Richtmann + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;} +#define CHECK_VAR(a) { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; } +// #undef CHECK +// #define CHECK + +NAMESPACE_BEGIN(Grid); + +class ParRdr { +private: + bool const swap; + + MPI_Status status; + MPI_File fp; + + int err; + + MPI_Datatype oddSiteType; + MPI_Datatype fileViewType; + + GridBase* grid; + +public: + ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr) + : swap(false) + , grid(gridPtr) { + err = MPI_File_open(comm, const_cast(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp); + assert(err == MPI_SUCCESS); + } + + virtual 
~ParRdr() { MPI_File_close(&fp); } + + inline void errInfo(int const err, std::string const& func) { + static char estring[MPI_MAX_ERROR_STRING]; + int eclass = -1, len = 0; + MPI_Error_class(err, &eclass); + MPI_Error_string(err, estring, &len); + std::cerr << func << " - Error " << eclass << ": " << estring << std::endl; + } + + int readHeader(FieldMetaData& field) { + assert((grid->_ndimension == Nd) && (Nd == 4)); + assert(Nc == 3); + + OpenQcdHeader header; + + readBlock(reinterpret_cast(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR); + + header.plaq /= 3.; // TODO change this into normalizationfactor + + // sanity check (should trigger on endian issues) TODO remove? + assert(0 < header.Nt && header.Nt <= 1024); + assert(0 < header.Nx && header.Nx <= 1024); + assert(0 < header.Ny && header.Ny <= 1024); + assert(0 < header.Nz && header.Nz <= 1024); + + field.dimension[0] = header.Nx; + field.dimension[1] = header.Ny; + field.dimension[2] = header.Nz; + field.dimension[3] = header.Nt; + + for(int d = 0; d < Nd; d++) + assert(grid->FullDimensions()[d] == field.dimension[d]); + + field.plaquette = header.plaq; + + field.data_start = sizeof(OpenQcdHeader); + + return field.data_start; + } + + void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) { + err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status); + errInfo(err, "MPI_File_read_at_all"); + // CHECK_VAR(err) + + int read = -1; + MPI_Get_count(&status, datatype, &read); + // CHECK_VAR(read) + assert(nbytes == (uint64_t)read); + assert(err == MPI_SUCCESS); + } + + void createTypes() { + constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd + + err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS); + err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS); + + Coordinate const L = grid->GlobalDimensions(); + Coordinate const l = grid->LocalDimensions(); + Coordinate const i = 
grid->ThisProcessorCoor(); + + Coordinate sizes({L[2] / 2, L[1], L[0], L[3]}); + Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]}); + Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]}); + + err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS); + err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS); + } + + void freeTypes() { + err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS); + err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS); + } + + bool readGauge(std::vector& domain_buff, FieldMetaData& meta) { + auto hdr_offset = readHeader(meta); + CHECK + createTypes(); + err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS); + CHECK + int const domainSites = grid->lSites(); + domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd + + // the actual READ + constexpr uint64_t cm_size = 2 * Nc * Nc * sizeof(double); // 2_complex + constexpr uint64_t os_size = Nd * 2 * cm_size; // 2_fwdbwd + constexpr uint64_t max_elems = std::numeric_limits::max(); // int adressable elems: floor is fine + uint64_t const n_os = domainSites / 2; + + for(uint64_t os_idx = 0; os_idx < n_os;) { + uint64_t const read_os = os_idx + max_elems <= n_os ? 
max_elems : n_os - os_idx; + uint64_t const cm = os_idx * Nd * 2; + readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType); + os_idx += read_os; + } + + CHECK + err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + errInfo(err, "MPI_File_set_view1"); + assert(err == MPI_SUCCESS); + freeTypes(); + + std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl; + return true; + } +}; + +class OpenQcdIOChromaReference : public BinaryIO { +public: + template + static inline void readConfiguration(Lattice>& Umu, + Grid::FieldMetaData& header, + std::string file) { + typedef Lattice> DoubledGaugeField; + + assert(Ns == 4 and Nd == 4 and Nc == 3); + + auto grid = Umu.Grid(); + + typedef ColourMatrixD fobj; + + std::vector iodata( + Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file + + { + ParRdr rdr(MPI_COMM_WORLD, file, grid); + rdr.readGauge(iodata, header); + } // equivalent to using binaryio + + std::vector> Umu_ds_scalar(grid->lSites()); + + copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging + + DoubledGaugeField Umu_ds(grid); + + vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds); + + redistribute(Umu, Umu_ds); // equivalent to undoDoublestore + + FieldMetaData clone(header); + + GaugeStatistics(Umu, clone); + + RealD plaq_diff = fabs(clone.plaquette - header.plaquette); + + // clang-format off + std::cout << GridLogMessage << "OpenQcd Configuration " << file + << " plaquette " << clone.plaquette + << " header " << header.plaquette + << " difference " << plaq_diff + << std::endl; + // clang-format on + + RealD precTol = (getPrecision::value == 1) ? 
2e-7 : 2e-15; + RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code + + if(plaq_diff >= tol) + std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl; + assert(plaq_diff < tol); + + std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl; + } + +private: + template + static inline void redistribute(Lattice>& Umu, + Lattice> const& Umu_ds) { + Grid::conformable(Umu.Grid(), Umu_ds.Grid()); + Lattice> U(Umu.Grid()); + + U = PeekIndex(Umu_ds, 2) + Cshift(PeekIndex(Umu_ds, 3), 0, +1); PokeIndex(Umu, U, 0); + U = PeekIndex(Umu_ds, 4) + Cshift(PeekIndex(Umu_ds, 5), 1, +1); PokeIndex(Umu, U, 1); + U = PeekIndex(Umu_ds, 6) + Cshift(PeekIndex(Umu_ds, 7), 2, +1); PokeIndex(Umu, U, 2); + U = PeekIndex(Umu_ds, 0) + Cshift(PeekIndex(Umu_ds, 1), 3, +1); PokeIndex(Umu, U, 3); + } + + static inline void copyToLatticeObject(std::vector& u_fb, + std::vector const& node_buff, + GridBase* grid) { + assert(node_buff.size() == Nd * grid->lSites()); + + Coordinate const& l = grid->LocalDimensions(); + + Coordinate coord(Nd); + int& x = coord[0]; + int& y = coord[1]; + int& z = coord[2]; + int& t = coord[3]; + + int buff_idx = 0; + for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering + for(x = 0; x < l[0]; ++x) + for(y = 0; y < l[1]; ++y) + for(z = 0; z < l[2]; ++z) { + if((t + z + y + x) % 2 == 0) continue; + + int local_idx; + Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions()); + for(int mu = 0; mu < 2 * Nd; ++mu) + for(int c1 = 0; c1 < Nc; ++c1) { + for(int c2 = 0; c2 < Nc; ++c2) { + u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2); + } + } + buff_idx += 2 * Nd; + } + + assert(node_buff.size() == buff_idx); + } +}; + +NAMESPACE_END(Grid); diff --git a/Grid/qcd/hmc/HMC_aggregate.h b/Grid/qcd/hmc/HMC_aggregate.h index 94c745e1..cb510953 100644 --- a/Grid/qcd/hmc/HMC_aggregate.h +++ b/Grid/qcd/hmc/HMC_aggregate.h @@ -40,6 
+40,9 @@ directory #include #include #include +#if !defined(GRID_COMMS_NONE) +#include +#endif NAMESPACE_CHECK(Ildg); #include diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc index 2a5769bd..83b498c2 100644 --- a/tests/IO/Test_openqcd_io.cc +++ b/tests/IO/Test_openqcd_io.cc @@ -28,28 +28,57 @@ See the full license in the file "LICENSE" in the top level distribution directo #include +#if defined(GRID_COMMS_NONE) +#error This test requires Grid compiled with MPI +#endif + using namespace Grid; int main(int argc, char** argv) { Grid_init(&argc, &argv); - GridCartesian* grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), - GridDefaultSimd(Nd, vComplexD::Nsimd()), - GridDefaultMpi()); + auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + auto latt_size = GridDefaultLatt(); - LatticeGaugeField Umu(grid); + GridCartesian grid(latt_size, simd_layout, mpi_layout); - FieldMetaData header; + GridParallelRNG pRNG(&grid); - if(!Grid::GridCmdOptionExists(argv, argv + argc, "--config")) { - std::cout << GridLogError << "You need to use --config /path/to/openqcd_config" << std::endl; - abort(); + pRNG.SeedFixedIntegers(std::vector({45, 12, 81, 9})); + + LatticeGaugeField Umu_ref(&grid); + LatticeGaugeField Umu_me(&grid); + LatticeGaugeField Umu_diff(&grid); + + FieldMetaData header_ref; + FieldMetaData header_me; + + Umu_ref = Zero(); + Umu_me = Zero(); + + std::string file("/home/daniel/configs/openqcd/test_16x8_pbcn6"); + + if(GridCmdOptionExists(argv, argv + argc, "--config")) { + file = GridCmdOptionPayload(argv, argv + argc, "--config"); + std::cout << "file: " << file << std::endl; + assert(!file.empty()); } - std::string file = Grid::GridCmdOptionPayload(argv, argv + argc, "--config"); - assert(!file.empty()); + OpenQcdIOChromaReference::readConfiguration(Umu_ref, header_ref, file); + OpenQcdIO::readConfiguration(Umu_me, header_me, file); - OpenQcdIO::readConfiguration(Umu, header, file); + std::cout 
<< GridLogMessage << header_ref << std::endl; + std::cout << GridLogMessage << header_me << std::endl; + + Umu_diff = Umu_ref - Umu_me; + + // clang-format off + std::cout << GridLogMessage + << "norm2(Umu_ref) = " << norm2(Umu_ref) + << " norm2(Umu_me) = " << norm2(Umu_me) + << " norm2(Umu_diff) = " << norm2(Umu_diff) << std::endl; + // clang-format on Grid_finalize(); } From a2188ea875246e16d2081142a62e59f85961a6c4 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 26 Mar 2020 09:12:36 -0400 Subject: [PATCH 05/51] remove debugging printf from WilsonKernelsImplementation --- .../fermion/implementation/WilsonKernelsImplementation.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index a787fa79..5600d25a 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -372,19 +372,19 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) 
{ KERNEL_CALL(GenericDhopSiteExt); return;} #ifndef GRID_NVCC if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} - if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;} + if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} #endif } assert(0 && " Kernel optimisation case not covered "); From 856d168e4131b915ecab245753b44b690662c881 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 29 Mar 2020 07:56:05 -0400 Subject: [PATCH 06/51] global sum over vectors of uint64_t --- Grid/communicator/Communicator_base.h | 1 + Grid/communicator/Communicator_mpi3.cc | 4 ++++ Grid/communicator/Communicator_none.cc | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index 11dbfcbb..436d75ef 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -114,6 +114,7 @@ public: void GlobalSumVector(RealD *,int N); void GlobalSum(uint32_t &); void GlobalSum(uint64_t &); + void GlobalSumVector(uint64_t*,int N); void GlobalSum(ComplexF &c); void GlobalSumVector(ComplexF *c,int N); void GlobalSum(ComplexD &c); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 2576b1fa..0e525674 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -255,6 +255,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); assert(ierr==0); } +void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){ + int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator); + assert(ierr==0); +} void CartesianCommunicator::GlobalXOR(uint32_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); assert(ierr==0); diff --git 
a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index b8a15a0e..81900371 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){} void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSum(double &){} +void CartesianCommunicator::GlobalSumVector(double *,int N){} void CartesianCommunicator::GlobalSum(uint32_t &){} void CartesianCommunicator::GlobalSum(uint64_t &){} -void CartesianCommunicator::GlobalSumVector(double *,int N){} +void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){} void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){} From 5fc8a273e7e8f1a29acebad0237b5e0bef2ecb97 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 6 Apr 2020 11:30:50 +0200 Subject: [PATCH 07/51] Fused innerProduct + norm2 on first argument operation --- Grid/lattice/Lattice_reduction.h | 58 +++++++++++++- tests/Test_innerproduct_norm.cc | 126 +++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 tests/Test_innerproduct_norm.cc diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 3c5b03e5..de2efd72 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -204,8 +204,64 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt grid->GlobalSum(nrm); return nrm; } - +template strong_inline void +innerProduct_norm(ComplexD& ip, RealD &nrm, const Lattice &left,const Lattice &right) +{ + conformable(left,right); + + typedef typename vobj::scalar_type scalar_type; + typedef typename vobj::vector_typeD vector_type; + Vector tmp(2); + + GridBase *grid = left.Grid(); + + auto left_v=left.View(); + auto right_v=right.View(); + + const uint64_t nsimd = grid->Nsimd(); + const 
uint64_t sites = grid->oSites(); + +#ifdef GRID_NVCC + // GPU + typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; + typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t; + Vector inner_tmp(sites); + Vector norm_tmp(sites); + auto inner_tmp_v = &inner_tmp[0]; + auto norm_tmp_v = &norm_tmp[0]; + + accelerator_for( ss, sites, nsimd,{ + auto left_tmp = left_v(ss); + coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); + coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp))); + }); + + tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites)); + tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites)); +#else + // CPU + typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t; + typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t; + Vector inner_tmp(sites); + Vector norm_tmp(sites); + auto inner_tmp_v = &inner_tmp[0]; + auto norm_tmp_v = &norm_tmp[0]; + + accelerator_for( ss, sites, nsimd,{ + auto left_tmp = left_v(ss); + inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss)); + norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp); + }); + // Already promoted to double + tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); + tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); +#endif + grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector + ip = tmp[0]; + nrm = real(tmp[1]); +} + template inline auto sum(const LatticeUnaryExpression & expr) ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object diff --git a/tests/Test_innerproduct_norm.cc b/tests/Test_innerproduct_norm.cc new file mode 100644 index 00000000..85c98521 --- /dev/null +++ b/tests/Test_innerproduct_norm.cc @@ -0,0 +1,126 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_innerproduct_norm.cc + +Copyright (C) 2015 + +Author: Daniel Richtmann + +This program is free software; you can redistribute it and/or modify +it under 
the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +using namespace Grid; + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + const int nIter = 100; + + // clang-format off + GridCartesian *Grid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi()); + GridCartesian *Grid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi()); + // clang-format on + + GridParallelRNG pRNG_d(Grid_d); + GridParallelRNG pRNG_f(Grid_f); + + std::vector seeds_d({1, 2, 3, 4}); + std::vector seeds_f({5, 6, 7, 8}); + + pRNG_d.SeedFixedIntegers(seeds_d); + pRNG_f.SeedFixedIntegers(seeds_f); + + // clang-format off + LatticeFermionD x_d(Grid_d); random(pRNG_d, x_d); + LatticeFermionD y_d(Grid_d); random(pRNG_d, y_d); + LatticeFermionF x_f(Grid_f); random(pRNG_f, x_f); + LatticeFermionF y_f(Grid_f); random(pRNG_f, y_f); + // clang-format on + + GridStopWatch sw_ref; + GridStopWatch sw_res; + + { // double precision + ComplexD ip_d_ref, ip_d_res, diff_ip_d; + RealD norm2_d_ref, norm2_d_res, diff_norm2_d; + + sw_ref.Reset(); + sw_ref.Start(); + for(int i = 0; i < nIter; ++i) { + ip_d_ref = innerProduct(x_d, y_d); + 
norm2_d_ref = norm2(x_d); + } + sw_ref.Stop(); + + sw_res.Reset(); + sw_res.Start(); + for(int i = 0; i < nIter; ++i) { innerProduct_norm(ip_d_res, norm2_d_res, x_d, y_d); } + sw_res.Stop(); + + diff_ip_d = ip_d_ref - ip_d_res; + diff_norm2_d = norm2_d_ref - norm2_d_res; + + // clang-format off + std::cout << GridLogMessage << "Double: ip_ref = " << ip_d_ref << " ip_res = " << ip_d_res << " diff = " << diff_ip_d << std::endl; + std::cout << GridLogMessage << "Double: norm2_ref = " << norm2_d_ref << " norm2_res = " << norm2_d_res << " diff = " << diff_norm2_d << std::endl; + std::cout << GridLogMessage << "Double: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl; + // clang-format on + + assert(diff_ip_d == 0.); + assert(diff_norm2_d == 0.); + + std::cout << GridLogMessage << "Double: all checks passed" << std::endl; + } + + { // single precision + ComplexD ip_f_ref, ip_f_res, diff_ip_f; + RealD norm2_f_ref, norm2_f_res, diff_norm2_f; + + sw_ref.Reset(); + sw_ref.Start(); + for(int i = 0; i < nIter; ++i) { + ip_f_ref = innerProduct(x_f, y_f); + norm2_f_ref = norm2(x_f); + } + sw_ref.Stop(); + + sw_res.Reset(); + sw_res.Start(); + for(int i = 0; i < nIter; ++i) { innerProduct_norm(ip_f_res, norm2_f_res, x_f, y_f); } + sw_res.Stop(); + + diff_ip_f = ip_f_ref - ip_f_res; + diff_norm2_f = norm2_f_ref - norm2_f_res; + + // clang-format off + std::cout << GridLogMessage << "Single: ip_ref = " << ip_f_ref << " ip_res = " << ip_f_res << " diff = " << diff_ip_f << std::endl; + std::cout << GridLogMessage << "Single: norm2_ref = " << norm2_f_ref << " norm2_res = " << norm2_f_res << " diff = " << diff_norm2_f << std::endl; + std::cout << GridLogMessage << "Single: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl; + // clang-format on + + assert(diff_ip_f == 0.); + assert(diff_norm2_f == 0.); + + std::cout << GridLogMessage << "Single: all checks passed" << std::endl; + } + + Grid_finalize(); +} From 
091d5c605ef041dbb95ba88e8a8b49c8a7b966af Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 17 Apr 2020 04:25:28 -0400 Subject: [PATCH 08/51] towards more precise blocking --- Grid/lattice/Lattice_reduction.h | 2 +- Grid/tensors/Tensor_class.h | 2 + Grid/tensors/Tensor_inner.h | 73 ++++++++++++++++++++++++++++++++ Grid/tensors/Tensor_traits.h | 20 +++++++++ 4 files changed, 96 insertions(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index de2efd72..d3f5f9ae 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -206,7 +206,7 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt } template strong_inline void -innerProduct_norm(ComplexD& ip, RealD &nrm, const Lattice &left,const Lattice &right) +innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Lattice &right) { conformable(left,right); diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h index 75e42721..dbcbae8d 100644 --- a/Grid/tensors/Tensor_class.h +++ b/Grid/tensors/Tensor_class.h @@ -6,6 +6,7 @@ Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: Michael Marshall +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -55,6 +56,7 @@ class GridTensorBase {}; using Complexified = typename Traits::Complexified; \ using Realified = typename Traits::Realified; \ using DoublePrecision = typename Traits::DoublePrecision; \ + using DoublePrecision2= typename Traits::DoublePrecision2; \ static constexpr int TensorLevel = Traits::TensorLevel template diff --git a/Grid/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h index 03f72966..c052adcf 100644 --- a/Grid/tensors/Tensor_inner.h +++ b/Grid/tensors/Tensor_inner.h @@ -8,6 +8,7 @@ Author: Azusa Yamaguchi Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can 
redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -194,6 +195,78 @@ auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar accelerator_inline + auto innerProductD2 (const iVector& lhs,const iVector& rhs) -> iScalar +{ + typedef decltype(innerProductD2(lhs._internal[0],rhs._internal[0])) ret_t; + iScalar ret; + zeroit(ret); + for(int c1=0;c1 accelerator_inline + auto innerProductD2 (const iMatrix& lhs,const iMatrix& rhs) -> iScalar +{ + typedef decltype(innerProductD2(lhs._internal[0][0],rhs._internal[0][0])) ret_t; + iScalar ret; + ret=Zero(); + for(int c1=0;c1 accelerator_inline + auto innerProductD2 (const iScalar& lhs,const iScalar& rhs) -> iScalar +{ + typedef decltype(innerProductD2(lhs._internal,rhs._internal)) ret_t; + iScalar ret; + ret._internal = innerProductD2(lhs._internal,rhs._internal); + return ret; +} + ////////////////////// // Keep same precison ////////////////////// diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 9067d43d..afb1f916 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -6,6 +6,7 @@ Author: Azusa Yamaguchi Author: Peter Boyle Author: Christopher Kelly Author: Michael Marshall +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -37,6 +38,10 @@ NAMESPACE_BEGIN(Grid); template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; + // To store double-precision data in single-precision grids for precision promoted localInnerProductD + typedef iVector vComplexD2; + typedef iVector vRealD2; + ////////////////////////////////////////////////////////////////////////////////// // Want to recurse: 
GridTypeMapper >::scalar_type == ComplexD. // Use of a helper class like this allows us to template specialise and "dress" @@ -81,6 +86,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexF Complexified; typedef RealF Realified; typedef RealD DoublePrecision; + typedef RealD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealD scalar_type; @@ -93,6 +99,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexD Complexified; typedef RealD Realified; typedef RealD DoublePrecision; + typedef RealD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexF scalar_type; @@ -105,6 +112,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexF Complexified; typedef RealF Realified; typedef ComplexD DoublePrecision; + typedef ComplexD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD scalar_type; @@ -117,6 +125,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexD Complexified; typedef RealD Realified; typedef ComplexD DoublePrecision; + typedef ComplexD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; @@ -129,6 +138,7 @@ NAMESPACE_BEGIN(Grid); typedef void Complexified; typedef void Realified; typedef void DoublePrecision; + typedef void DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { @@ -142,6 +152,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexF Complexified; typedef vRealF Realified; typedef vRealD DoublePrecision; + typedef vRealD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealD scalar_type; @@ -154,6 +165,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexD Complexified; typedef vRealD Realified; typedef vRealD DoublePrecision; + typedef vRealD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types @@ -167,6 +179,7 @@ 
NAMESPACE_BEGIN(Grid); typedef vComplexH Complexified; typedef vRealH Realified; typedef vRealD DoublePrecision; + typedef vRealD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types @@ -180,6 +193,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexH Complexified; typedef vRealH Realified; typedef vComplexD DoublePrecision; + typedef vComplexD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexF scalar_type; @@ -192,6 +206,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexF Complexified; typedef vRealF Realified; typedef vComplexD DoublePrecision; + typedef vComplexD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD scalar_type; @@ -204,6 +219,7 @@ NAMESPACE_BEGIN(Grid); typedef vComplexD Complexified; typedef vRealD Realified; typedef vComplexD DoublePrecision; + typedef vComplexD DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; @@ -216,6 +232,7 @@ NAMESPACE_BEGIN(Grid); typedef void Complexified; typedef void Realified; typedef void DoublePrecision; + typedef void DoublePrecision2; }; #define GridTypeMapper_RepeatedTypes \ @@ -234,6 +251,7 @@ NAMESPACE_BEGIN(Grid); using Complexified = iScalar; using Realified = iScalar; using DoublePrecision = iScalar; + using DoublePrecision2= iScalar; static constexpr int Rank = BaseTraits::Rank + 1; static constexpr std::size_t count = BaseTraits::count; static constexpr int Dimension(int dim) { @@ -248,6 +266,7 @@ NAMESPACE_BEGIN(Grid); using Complexified = iVector; using Realified = iVector; using DoublePrecision = iVector; + using DoublePrecision2= iVector; static constexpr int Rank = BaseTraits::Rank + 1; static constexpr std::size_t count = BaseTraits::count * N; static constexpr int Dimension(int dim) { @@ -262,6 +281,7 @@ NAMESPACE_BEGIN(Grid); using 
Complexified = iMatrix; using Realified = iMatrix; using DoublePrecision = iMatrix; + using DoublePrecision2= iMatrix; static constexpr int Rank = BaseTraits::Rank + 2; static constexpr std::size_t count = BaseTraits::count * N * N; static constexpr int Dimension(int dim) { From f1fe444d4fa205896f1d3c3b37c44567086a3b96 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 24 Apr 2020 06:27:20 -0400 Subject: [PATCH 09/51] blocked precision promotion infrastructure upgrade --- Grid/tensors/Tensor_inner.h | 1 + Grid/tensors/Tensor_traits.h | 24 +++++++++++++++++++++--- tests/Test_innerproduct_norm.cc | 4 ++-- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/Grid/tensors/Tensor_inner.h b/Grid/tensors/Tensor_inner.h index c052adcf..fd651cae 100644 --- a/Grid/tensors/Tensor_inner.h +++ b/Grid/tensors/Tensor_inner.h @@ -200,6 +200,7 @@ auto innerProductD (const iScalar& lhs,const iScalar& rhs) -> iScalar struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; - // To store double-precision data in single-precision grids for precision promoted localInnerProductD + // Traits to identify scalars + template struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; }; + template struct isGridScalar> : public std::true_type { static constexpr bool notvalue = false; }; + + // Store double-precision data in single-precision grids for precision promoted localInnerProductD + typedef iVector ComplexD2; typedef iVector vComplexD2; + typedef iVector RealD2; typedef iVector vRealD2; + // Traits to identify fundamental data types + template struct isGridFundamental : public std::false_type { static constexpr bool notvalue = true; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static 
constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + + ////////////////////////////////////////////////////////////////////////////////// // Want to recurse: GridTypeMapper >::scalar_type == ComplexD. // Use of a helper class like this allows us to template specialise and "dress" @@ -86,7 +104,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexF Complexified; typedef RealF Realified; typedef RealD DoublePrecision; - typedef RealD DoublePrecision2; + typedef RealD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef RealD scalar_type; @@ -112,7 +130,7 @@ NAMESPACE_BEGIN(Grid); typedef ComplexF Complexified; typedef RealF Realified; typedef ComplexD DoublePrecision; - typedef ComplexD DoublePrecision2; + typedef ComplexD2 DoublePrecision2; }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexD scalar_type; diff --git a/tests/Test_innerproduct_norm.cc b/tests/Test_innerproduct_norm.cc index 85c98521..a8718c6b 100644 --- a/tests/Test_innerproduct_norm.cc +++ b/tests/Test_innerproduct_norm.cc @@ -72,7 +72,7 @@ int main(int argc, char** argv) { sw_res.Reset(); sw_res.Start(); - for(int i = 0; i < nIter; ++i) { innerProduct_norm(ip_d_res, norm2_d_res, x_d, y_d); } + for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_d_res, norm2_d_res, x_d, y_d); } sw_res.Stop(); diff_ip_d = ip_d_ref - 
ip_d_res; @@ -104,7 +104,7 @@ int main(int argc, char** argv) { sw_res.Reset(); sw_res.Start(); - for(int i = 0; i < nIter; ++i) { innerProduct_norm(ip_f_res, norm2_f_res, x_f, y_f); } + for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_f_res, norm2_f_res, x_f, y_f); } sw_res.Stop(); diff_ip_f = ip_f_ref - ip_f_res; From f4033ad8cb32c34debe1623d84eab7c0d79116d5 Mon Sep 17 00:00:00 2001 From: ferben Date: Mon, 27 Apr 2020 17:46:14 +0100 Subject: [PATCH 10/51] baryon speedup by a factor 2 --- Grid/qcd/utils/BaryonUtils.h | 416 ++++++++++++++++++++++++++++++++++- 1 file changed, 407 insertions(+), 9 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index d65b9176..18d6f84b 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -46,7 +46,8 @@ public: typedef typename SpinMatrixField::vector_object sobj; static const int epsilon[6][3] ; - static const Complex epsilon_sgn[6]; + //static const Complex epsilon_sgn[6]; + static const double epsilon_sgn[6]; private: template @@ -60,6 +61,62 @@ public: const int parity, const int * wick_contractions, robj &result); + template + static void baryon_site_macro(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result); + template + static void baryon_site_macro(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + const int * wick_contractions, + robj &result); + template + static inline void baryon_site_template(unsigned int mask, const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result); + template + static inline void baryon_site_template(const mobj &D1, + const 
mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result); + + template + struct BaryonSiteHelper + { + template + static inline void function(const unsigned int mask, const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result); + }; public: static void ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -151,14 +208,18 @@ public: template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -template +/*template const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), Complex(1), Complex(1), Complex(-1), Complex(-1), Complex(-1)}; +*/ +template +const double BaryonUtils::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0}; +//This is the old version template template void BaryonUtils::baryon_site(const mobj &D1, @@ -188,13 +249,15 @@ void BaryonUtils::baryon_site(const mobj &D1, int a_right = epsilon[ie_right][0]; //a' int b_right = epsilon[ie_right][1]; //b' int c_right = epsilon[ie_right][2]; //c' + //complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; //This is the \delta_{456}^{123} part if (wick_contraction[0]){ auto D2g = D2 * GammaB_left; for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right::baryon_site(const mobj &D1, for (int alpha_right=0; alpha_right +template +void BaryonUtils::baryon_site_macro(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const 
Gamma GammaB_right, + const int parity, + robj &result) +{ + + Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) + + auto gD1a = GammaA_left * GammaA_right * D1; + auto gD1b = GammaA_left * g4 * GammaA_right * D1; + auto pD1 = 0.5* (gD1a + (double)parity * gD1b); + auto gD3 = GammaB_right * D3; + + auto D2g = D2 * GammaB_left; + auto pD1g = pD1 * GammaB_left; + auto gD3g = gD3 * GammaB_left; + + for (int ie_left=0; ie_left < 6 ; ie_left++){ + int a_left = epsilon[ie_left][0]; //a + int b_left = epsilon[ie_left][1]; //b + int c_left = epsilon[ie_left][2]; //c + for (int ie_right=0; ie_right < 6 ; ie_right++){ + int a_right = epsilon[ie_right][0]; //a' + int b_right = epsilon[ie_right][1]; //b' + int c_right = epsilon[ie_right][2]; //c' + double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + //All parts together + for (int gamma_left=0; gamma_left( D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, result );\ +} + +template +template +void BaryonUtils::baryon_site_macro(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + const int * wick_contraction, + robj &result) +{ +BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 1 , D1, D2, D3, 
GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, 
parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 
0 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, 
GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, 
wick_contraction, result); +BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); + +} + + +template +template +inline void BaryonUtils::baryon_site_template(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result) +{ + constexpr bool wick_contraction_0 = ((mask & (1 << 5)) >> 5); + constexpr bool wick_contraction_1 = ((mask & (1 << 4)) >> 4); + constexpr bool wick_contraction_2 = ((mask & (1 << 3)) >> 3); + constexpr bool wick_contraction_3 = ((mask & (1 << 2)) >> 2); + constexpr bool wick_contraction_4 = ((mask & (1 << 1)) >> 1); + constexpr bool wick_contraction_5 = ((mask & (1 << 0)) >> 0); + + Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) + + auto gD1a = GammaA_left * GammaA_right * D1; + auto gD1b = GammaA_left * g4 * GammaA_right * D1; + auto pD1 = 0.5* (gD1a + (double)parity * gD1b); + auto gD3 = GammaB_right * D3; + + auto D2g = D2 * GammaB_left; + auto pD1g = pD1 * GammaB_left; + auto gD3g = gD3 * GammaB_left; + + for (int ie_left=0; ie_left < 6 ; ie_left++){ + int a_left = epsilon[ie_left][0]; //a + int b_left = epsilon[ie_left][1]; //b + int c_left = epsilon[ie_left][2]; //c + for (int ie_right=0; ie_right < 6 ; ie_right++){ + int a_right = epsilon[ie_right][0]; //a' + int b_right = epsilon[ie_right][1]; //b' + int c_right = epsilon[ie_right][2]; //c' + double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + //All parts together + for (int gamma_left=0; gamma_left +template +template +inline void BaryonUtils::BaryonSiteHelper::function(const unsigned int mask, const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result) +{ + if (mask == maxMask) + { + 
baryon_site_template(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); + } + else + { + BaryonSiteHelper<(maxMask>0) ? maxMask-1 : 0>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); + } +} + +// top-level function +template +template +inline void BaryonUtils::baryon_site_template(const unsigned int mask, const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + robj &result) +{ + BaryonSiteHelper<63>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); +} + + template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -259,6 +593,10 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const int parity, ComplexField &baryon_corr) { + const std::chrono::system_clock::time_point start{ std::chrono::system_clock::now() }; + std::time_t now = std::chrono::system_clock::to_time_t( start ); + std::cout << "Setup start " << std::ctime( &now ); + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; @@ -278,6 +616,16 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto v2 = q2_left.View(); auto v3 = q3_left.View(); + const std::chrono::system_clock::time_point stop{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( stop ); + const std::chrono::duration duration_seconds = stop - start; + const double seconds{ ( duration_seconds.count() ) }; + std::cout << "Setup stop " << std::ctime( &now ) + << "Total duration " << std::fixed << std::setprecision(5) << seconds << " 
seconds." << std::endl; + + const std::chrono::system_clock::time_point start2{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( start2 ); + std::cout << "Normal Loop start " << std::ctime( &now ); // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ //for(int ss=0; ss < grid->oSites(); ss++){ @@ -290,6 +638,56 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites + const std::chrono::system_clock::time_point stop2{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( stop2 ); + const std::chrono::duration duration_seconds2 = stop2 - start2; + const double seconds2{ ( duration_seconds2.count() ) }; + std::cout << "Normal Loop stop " << std::ctime( &now ) + << "Total duration " << std::fixed << std::setprecision(5) << seconds2 << " seconds." 
<< std::endl; + const std::chrono::system_clock::time_point start4{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( start4 ); + std::cout << "Opt-macro Loop start " << std::ctime( &now ); + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + thread_for(ss,grid->oSites(),{ + //for(int ss=0; ss < grid->oSites(); ss++){ + + auto D1 = v1[ss]; + auto D2 = v2[ss]; + auto D3 = v3[ss]; + + vobj result=Zero(); + baryon_site_macro(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + vbaryon_corr[ss] = result; + } );//end loop over lattice sites + const std::chrono::system_clock::time_point stop4{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( stop4 ); + const std::chrono::duration duration_seconds4 = stop4 - start4; + const double seconds4{ ( duration_seconds4.count() ) }; + std::cout << "Opt-macro Loop stop " << std::ctime( &now ) + << "Total duration " << std::fixed << std::setprecision(5) << seconds4 << " seconds." 
<< std::endl; + const std::chrono::system_clock::time_point start3{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( start3 ); + int wick_id=32*wick_contraction[0]+16*wick_contraction[1]+8*wick_contraction[2]+4*wick_contraction[3]+2*wick_contraction[4]+wick_contraction[5]; + std::cout << "Opt-template Loop start " << std::ctime( &now ); + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + thread_for(ss,grid->oSites(),{ + //for(int ss=0; ss < grid->oSites(); ss++){ + + auto D1 = v1[ss]; + auto D2 = v2[ss]; + auto D3 = v3[ss]; + + vobj result=Zero(); + baryon_site_template(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); + vbaryon_corr[ss] = result; + } );//end loop over lattice sites + const std::chrono::system_clock::time_point stop3{ std::chrono::system_clock::now() }; + now = std::chrono::system_clock::to_time_t( stop3 ); + const std::chrono::duration duration_seconds3 = stop3 - start3; + const double seconds3{ ( duration_seconds3.count() ) }; + std::cout << "Opt-template Loop stop " << std::ctime( &now ) + << "Total duration " << std::fixed << std::setprecision(5) << seconds3 << " seconds." << std::endl; + } template template @@ -318,7 +716,7 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); } /*********************************************************************** From 6240e02619be0b0c5bda84173c0dc6cba62aef84 Mon Sep 17 00:00:00 2001 From: ferben Date: Mon, 27 Apr 2020 18:50:53 +0100 Subject: [PATCH 11/51] added assertion to avoid potential infinite loop --- Grid/qcd/utils/BaryonUtils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 18d6f84b..a392f223 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -553,6 +553,7 @@ inline void BaryonUtils::BaryonSiteHelper::function(const unsign const int parity, robj &result) { + assert(mask <= maxMask); if (mask == maxMask) { baryon_site_template(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); From dee96cbf8296b9f16dc378c78cdcb74302da77c5 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Wed, 29 Apr 2020 10:37:11 -0400 Subject: [PATCH 12/51] Added workaround in configure to still catch Cuda compiler when nvcc with extra arguments (eg -ccbin) is used as CXX --- configure.ac | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index fb69ca0e..46559507 100644 --- a/configure.ac +++ b/configure.ac @@ -274,12 +274,20 @@ case ${ac_gen_scalar} in esac ##################### Compiler dependent choices -case ${CXX} in + +#Strip any optional compiler arguments from nvcc call (eg -ccbin) for compiler comparison +CXXBASE=${CXX} +CXXTEST=${CXX} +if echo "${CXX}" | grep -q "nvcc"; then + CXXTEST="nvcc" +fi + +case ${CXXTEST} in nvcc) # CXX="nvcc -keep -v -x cu " # CXXLD="nvcc -v -link" - CXX="nvcc -x cu " - CXXLD="nvcc -link" + CXX="${CXXBASE} -x cu " + CXXLD="${CXXBASE} -link" # CXXFLAGS="$CXXFLAGS 
-Xcompiler -fno-strict-aliasing -Xcompiler -Wno-unusable-partial-specialization --expt-extended-lambda --expt-relaxed-constexpr" CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" if test $ac_openmp = yes; then From dbaeefaeef334eabd3feeb73c70da13c9a548bcf Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Thu, 30 Apr 2020 15:02:51 +0100 Subject: [PATCH 13/51] All Eigen::TensorMap objects are fixed (i.e. cannot be dynamically resized) --- Grid/serialisation/BaseIO.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h index bf424fc7..49406201 100644 --- a/Grid/serialisation/BaseIO.h +++ b/Grid/serialisation/BaseIO.h @@ -87,11 +87,7 @@ namespace Grid { template struct is_tensor_fixed> : public std::true_type {}; - template class MapPointer_> - struct is_tensor_fixed, MapOptions_, MapPointer_>> - : public std::true_type {}; + template struct is_tensor_fixed> : public std::true_type {}; // Is this a variable-size Eigen tensor template struct is_tensor_variable : public std::false_type {}; From 4a4b9e305d39dc4be0d1497eab81d432a831dbf4 Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Thu, 30 Apr 2020 19:40:04 +0100 Subject: [PATCH 14/51] Fix: strToVec enters infinite loop and exhausts memory if operator>> fails before the end of string, e.g. if parsing "0_0_0" for momentum instead of "0 0 0". 
--- Grid/serialisation/VectorUtils.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Grid/serialisation/VectorUtils.h b/Grid/serialisation/VectorUtils.h index a5a73992..dd5ff0b8 100644 --- a/Grid/serialisation/VectorUtils.h +++ b/Grid/serialisation/VectorUtils.h @@ -432,12 +432,10 @@ namespace Grid { std::vector strToVec(const std::string s) { std::istringstream sstr(s); - T buf; std::vector v; - while(!sstr.eof()) + for(T buf; sstr >> buf;) { - sstr >> buf; v.push_back(buf); } From ddb192bac73f7d0b2329f8bd43fe4d64e31f90e1 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 30 Apr 2020 16:09:57 -0400 Subject: [PATCH 15/51] re-work double precision promotion for summit --- Grid/lattice/Lattice_reduction.h | 2 +- Grid/parallelIO/NerscIO.h | 2 +- Grid/tensors/Tensor_traits.h | 92 ++++++++++++++++++++++++++++++-- 3 files changed, 90 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index d3f5f9ae..e2690cfe 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -234,7 +234,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti accelerator_for( ss, sites, nsimd,{ auto left_tmp = left_v(ss); coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss))); - coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp))); + coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp)); }); tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites)); diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h index d3b62d1f..5522ba91 100644 --- a/Grid/parallelIO/NerscIO.h +++ b/Grid/parallelIO/NerscIO.h @@ -146,7 +146,7 @@ public: int ieee32big = (format == std::string("IEEE32BIG")); int ieee32 = (format == std::string("IEEE32")); int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); + int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE")); 
uint32_t nersc_csum,scidac_csuma,scidac_csumb; // depending on datatype, set up munger; diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 5359c547..04d7343e 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -43,10 +43,38 @@ NAMESPACE_BEGIN(Grid); template struct isGridScalar> : public std::true_type { static constexpr bool notvalue = false; }; // Store double-precision data in single-precision grids for precision promoted localInnerProductD - typedef iVector ComplexD2; - typedef iVector vComplexD2; - typedef iVector RealD2; - typedef iVector vRealD2; + template + class TypePair { + public: + T _internal[2]; + TypePair& operator=(const Grid::Zero& o) { + _internal[0] = Zero(); + _internal[1] = Zero(); + return *this; + } + + TypePair operator+(const TypePair& o) const { + TypePair r; + r._internal[0] = _internal[0] + o._internal[0]; + r._internal[1] = _internal[1] + o._internal[1]; + return r; + } + + TypePair& operator+=(const TypePair& o) { + _internal[0] += o._internal[0]; + _internal[1] += o._internal[1]; + return *this; + } + + friend accelerator_inline void add(TypePair* ret, const TypePair* a, const TypePair* b) { + add(&ret->_internal[0],&a->_internal[0],&b->_internal[0]); + add(&ret->_internal[1],&a->_internal[1],&b->_internal[1]); + } + }; + typedef TypePair ComplexD2; + typedef TypePair RealD2; + typedef TypePair vComplexD2; + typedef TypePair vRealD2; // Traits to identify fundamental data types template struct isGridFundamental : public std::false_type { static constexpr bool notvalue = true; }; @@ -58,6 +86,10 @@ NAMESPACE_BEGIN(Grid); template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public 
std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; + template<> struct isGridFundamental : public std::true_type { static constexpr bool notvalue = false; }; ////////////////////////////////////////////////////////////////////////////////// @@ -119,6 +151,19 @@ NAMESPACE_BEGIN(Grid); typedef RealD DoublePrecision; typedef RealD DoublePrecision2; }; + template<> struct GridTypeMapper : public GridTypeMapper_Base { + typedef RealD2 scalar_type; + typedef RealD2 scalar_typeD; + typedef RealD2 vector_type; + typedef RealD2 vector_typeD; + typedef RealD2 tensor_reduced; + typedef RealD2 scalar_object; + typedef RealD2 scalar_objectD; + typedef ComplexD2 Complexified; + typedef RealD2 Realified; + typedef RealD2 DoublePrecision; + typedef RealD2 DoublePrecision2; + }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef ComplexF scalar_type; typedef ComplexD scalar_typeD; @@ -145,6 +190,19 @@ NAMESPACE_BEGIN(Grid); typedef ComplexD DoublePrecision; typedef ComplexD DoublePrecision2; }; + template<> struct GridTypeMapper : public GridTypeMapper_Base { + typedef ComplexD2 scalar_type; + typedef ComplexD2 scalar_typeD; + typedef ComplexD2 vector_type; + typedef ComplexD2 vector_typeD; + typedef ComplexD2 tensor_reduced; + typedef ComplexD2 scalar_object; + typedef ComplexD2 scalar_objectD; + typedef ComplexD2 Complexified; + typedef RealD2 Realified; + typedef ComplexD2 DoublePrecision; + typedef ComplexD2 DoublePrecision2; + }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; typedef Integer scalar_typeD; @@ -185,6 +243,19 @@ NAMESPACE_BEGIN(Grid); typedef vRealD DoublePrecision; typedef vRealD DoublePrecision2; }; + template<> struct GridTypeMapper : public GridTypeMapper_Base { + typedef 
RealD2 scalar_type; + typedef RealD2 scalar_typeD; + typedef vRealD2 vector_type; + typedef vRealD2 vector_typeD; + typedef vRealD2 tensor_reduced; + typedef RealD2 scalar_object; + typedef RealD2 scalar_objectD; + typedef vComplexD2 Complexified; + typedef vRealD2 Realified; + typedef vRealD2 DoublePrecision; + typedef vRealD2 DoublePrecision2; + }; template<> struct GridTypeMapper : public GridTypeMapper_Base { // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types typedef RealF scalar_type; @@ -239,6 +310,19 @@ NAMESPACE_BEGIN(Grid); typedef vComplexD DoublePrecision; typedef vComplexD DoublePrecision2; }; + template<> struct GridTypeMapper : public GridTypeMapper_Base { + typedef ComplexD2 scalar_type; + typedef ComplexD2 scalar_typeD; + typedef vComplexD2 vector_type; + typedef vComplexD2 vector_typeD; + typedef vComplexD2 tensor_reduced; + typedef ComplexD2 scalar_object; + typedef ComplexD2 scalar_objectD; + typedef vComplexD2 Complexified; + typedef vRealD2 Realified; + typedef vComplexD2 DoublePrecision; + typedef vComplexD2 DoublePrecision2; + }; template<> struct GridTypeMapper : public GridTypeMapper_Base { typedef Integer scalar_type; typedef Integer scalar_typeD; From c8af498a2abb195f606b389a12824bf3e65315f7 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 1 May 2020 03:45:50 -0400 Subject: [PATCH 16/51] BinaryIO fix for alternative little-endian format name (used in 96I ensemble) --- Grid/parallelIO/BinaryIO.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h index f90c34a9..1f11add9 100644 --- a/Grid/parallelIO/BinaryIO.h +++ b/Grid/parallelIO/BinaryIO.h @@ -341,7 +341,7 @@ class BinaryIO { int ieee32big = (format == std::string("IEEE32BIG")); int ieee32 = (format == std::string("IEEE32")); int ieee64big = (format == std::string("IEEE64BIG")); - int ieee64 = (format == std::string("IEEE64")); + int ieee64 = (format == std::string("IEEE64") || 
format == std::string("IEEE64LITTLE")); assert(ieee64||ieee32|ieee64big||ieee32big); assert((ieee64+ieee32+ieee64big+ieee32big)==1); ////////////////////////////////////////////////////////////////////////////// From 63cf201ee7ad32271537e5bbfe70d4df049a83f5 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 2 May 2020 11:38:42 -0400 Subject: [PATCH 17/51] Add AdviseInfrequentUse --- Grid/lattice/Lattice_base.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index ec7c54ec..a16b4fa8 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -9,6 +9,7 @@ Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: paboyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -75,6 +76,14 @@ public: if (grid) conformable(grid, _grid); else grid = _grid; }; + + // Advise that the data is used infrequently. This can + // significantly influence performance of bulk storage. + accelerator_inline void AdviseInfrequentUse() { +#ifdef __CUDA_ARCH__ + cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId); +#endif + }; }; ///////////////////////////////////////////////////////////////////////////////////////// From 949be9605cf2d801439ec38ad85aa9554b29eb58 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 2 May 2020 16:20:03 -0400 Subject: [PATCH 18/51] fix pragmas --- Grid/lattice/Lattice_base.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index a16b4fa8..271ab43e 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -80,8 +80,10 @@ public: // Advise that the data is used infrequently. This can // significantly influence performance of bulk storage. 
accelerator_inline void AdviseInfrequentUse() { -#ifdef __CUDA_ARCH__ +#ifdef GRID_NVCC +#ifndef __CUDA_ARCH__ // only on host cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId); +#endif #endif }; }; From 38532753f40a28b98d648e0896138f9fab9c3cf9 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 3 May 2020 08:58:32 -0400 Subject: [PATCH 19/51] interface cleanup --- Grid/lattice/Lattice_base.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 271ab43e..1e8b4152 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -50,6 +50,14 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) assert(lhs == rhs); } +//////////////////////////////////////////////////////////////////////////// +// Advise for memory management +//////////////////////////////////////////////////////////////////////////// +enum LatticeAcceleratorAdvise { + AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can + // significantly influence performance of bulk storage. +}; + //////////////////////////////////////////////////////////////////////////// // Minimal base class containing only data valid to access from accelerator // _odata will be a managed pointer in CUDA @@ -77,12 +85,12 @@ public: else grid = _grid; }; - // Advise that the data is used infrequently. This can - // significantly influence performance of bulk storage. 
- accelerator_inline void AdviseInfrequentUse() { + accelerator_inline void Advise(int advise) { #ifdef GRID_NVCC #ifndef __CUDA_ARCH__ // only on host - cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId); + if (advise & AdviseInfrequentUse) { + cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId); + } #endif #endif }; From 9bfa51bffbd3550b4ee2f3cd7594ccb5bbe75535 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 3 May 2020 09:12:52 -0400 Subject: [PATCH 20/51] cleanup comment --- Grid/lattice/Lattice_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 1e8b4152..157c647b 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -51,7 +51,7 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) } //////////////////////////////////////////////////////////////////////////// -// Advise for memory management +// Advise the LatticeAccelerator class //////////////////////////////////////////////////////////////////////////// enum LatticeAcceleratorAdvise { AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. 
This can From 2a1387e992781258ffaf636142bb0bb13b168997 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 3 May 2020 17:27:11 -0400 Subject: [PATCH 21/51] rankInnerProduct --- Grid/lattice/Lattice_reduction.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index e2690cfe..8acbde66 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -5,6 +5,7 @@ Author: Azusa Yamaguchi Author: Peter Boyle Author: paboyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -93,7 +94,7 @@ template inline RealD norm2(const Lattice &arg){ // Double inner product template -inline ComplexD innerProduct(const Lattice &left,const Lattice &right) +inline ComplexD rankInnerProduct(const Lattice &left,const Lattice &right) { typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_typeD vector_type; @@ -137,11 +138,18 @@ inline ComplexD innerProduct(const Lattice &left,const Lattice &righ }) nrm = TensorRemove(sum(inner_tmp_v,sites)); #endif - grid->GlobalSum(nrm); - return nrm; } +template +inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { + GridBase *grid = left.Grid(); + ComplexD nrm = rankInnerProduct(left,right); + grid->GlobalSum(nrm); + return nrm; +} + + ///////////////////////// // Fast axpby_norm // z = a x + b y From 04863f8f3835bbfec9f913139eddc1dca717bbb2 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Mon, 4 May 2020 16:07:03 -0400 Subject: [PATCH 22/51] debug new AcceleratorView --- Grid/lattice/Lattice_ET.h | 3 +- Grid/lattice/Lattice_arith.h | 69 ++++++++++++++++--------------- Grid/lattice/Lattice_base.h | 71 +++++++++++++++++++++++++++----- Grid/lattice/Lattice_reduction.h | 14 +++---- 4 files changed, 
105 insertions(+), 52 deletions(-) diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index cf7147b9..da63d5e6 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -9,6 +9,7 @@ Copyright (C) 2015 Author: Azusa Yamaguchi Author: Peter Boyle Author: neo +Author: Christoph Lehner &arg) template accelerator_inline const lobj & eval(const uint64_t ss, const Lattice &arg) { - auto view = arg.View(); + auto view = arg.AcceleratorView(ViewRead); return view[ss]; } diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index 3543d6aa..c4a67620 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -7,6 +7,7 @@ Copyright (C) 2015 Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid); template inline void mult(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); conformable(ret,rhs); conformable(lhs,rhs); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ @@ -55,9 +56,9 @@ void mac(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -72,9 +73,9 @@ void sub(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); 
conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -88,9 +89,9 @@ void add(Lattice &ret,const Lattice &lhs,const Lattice &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,rhs); conformable(lhs,rhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); - auto rhs_v = rhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -107,8 +108,8 @@ template inline void mult(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; mult(&tmp,&lhs_v(ss),&rhs); @@ -120,8 +121,8 @@ template inline void mac(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -134,8 +135,8 @@ template inline void sub(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(ret,lhs); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); 
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -147,8 +148,8 @@ template inline void add(Lattice &ret,const Lattice &lhs,const obj3 &rhs){ ret.Checkerboard() = lhs.Checkerboard(); conformable(lhs,ret); - auto ret_v = ret.View(); - auto lhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto lhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto lhs_t=lhs_v(ss); @@ -164,8 +165,8 @@ template inline void mult(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -178,8 +179,8 @@ template inline void mac(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -192,8 +193,8 @@ template inline void sub(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto rhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -205,8 +206,8 @@ template inline void add(Lattice &ret,const obj2 &lhs,const Lattice &rhs){ ret.Checkerboard() = rhs.Checkerboard(); conformable(ret,rhs); - auto ret_v = ret.View(); - auto rhs_v = lhs.View(); + auto ret_v = 
ret.AcceleratorView(ViewWrite); + auto rhs_v = lhs.AcceleratorView(ViewRead); accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ decltype(coalescedRead(obj1())) tmp; auto rhs_t=rhs_v(ss); @@ -220,9 +221,9 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(); - auto x_v = x.View(); - auto y_v = y.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto x_v = x.AcceleratorView(ViewRead); + auto y_v = y.AcceleratorView(ViewRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+y_v(ss); coalescedWrite(ret_v[ss],tmp); @@ -233,9 +234,9 @@ void axpby(Lattice &ret,sobj a,sobj b,const Lattice &x,const Lattice ret.Checkerboard() = x.Checkerboard(); conformable(ret,x); conformable(x,y); - auto ret_v = ret.View(); - auto x_v = x.View(); - auto y_v = y.View(); + auto ret_v = ret.AcceleratorView(ViewWrite); + auto x_v = x.AcceleratorView(ViewRead); + auto y_v = y.AcceleratorView(ViewRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ auto tmp = a*x_v(ss)+b*y_v(ss); coalescedWrite(ret_v[ss],tmp); diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 157c647b..30aa6b06 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -54,8 +54,20 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) // Advise the LatticeAccelerator class //////////////////////////////////////////////////////////////////////////// enum LatticeAcceleratorAdvise { - AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can - // significantly influence performance of bulk storage. + AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can + // significantly influence performance of bulk storage. + AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures + // enables read-only copies of memory to be kept on + // host and device. 
+}; + +//////////////////////////////////////////////////////////////////////////// +// View Access Mode +//////////////////////////////////////////////////////////////////////////// +enum ViewMode { + ViewRead = 0x1, + ViewWrite = 0x2, + ViewReadWrite = 0x3 }; //////////////////////////////////////////////////////////////////////////// @@ -91,6 +103,29 @@ public: if (advise & AdviseInfrequentUse) { cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId); } + if (advise & AdviseReadMostly) { + cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1); + } +#endif +#endif + }; + + accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future +#ifdef GRID_NVCC +#ifndef __CUDA_ARCH__ // only on host + int target; + cudaGetDevice(&target); + cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target); + std::cout<< GridLogMessage << "To Device " << target << std::endl; +#endif +#endif + }; + + accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future +#ifdef GRID_NVCC +#ifndef __CUDA_ARCH__ // only on host + cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); + std::cout<< GridLogMessage << "To Host" << std::endl; #endif #endif }; @@ -225,9 +260,23 @@ public: // The view is trivially copy constructible and may be copied to an accelerator device // in device lambdas ///////////////////////////////////////////////////////////////////////////////// - LatticeView View (void) const + LatticeView View (void) const // deprecated, should pick AcceleratorView for accelerator_for + { // and HostView for thread_for + LatticeView accessor(*( (LatticeAccelerator *) this)); + return accessor; + } + + LatticeView AcceleratorView(int mode = ViewReadWrite) const { LatticeView accessor(*( (LatticeAccelerator *) this)); + accessor.AcceleratorPrefetch(mode); + return accessor; + } + + LatticeView HostView(int mode = 
ViewReadWrite) const + { + LatticeView accessor(*( (LatticeAccelerator *) this)); + accessor.HostPrefetch(mode); return accessor; } @@ -251,7 +300,7 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -270,7 +319,7 @@ public: assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -288,7 +337,7 @@ public: CBFromExpression(cb,expr); assert( (cb==Odd) || (cb==Even)); this->checkerboard=cb; - auto me = View(); + auto me = AcceleratorView(ViewWrite); accelerator_for(ss,me.size(),1,{ auto tmp = eval(ss,expr); vstream(me[ss],tmp); @@ -399,8 +448,9 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - auto me = View(); - auto him= r.View(); + std::cout << GridLogMessage << "Copy other" << std::endl; + auto me = AcceleratorView(ViewWrite); + auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); @@ -413,8 +463,9 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - auto me = View(); - auto him= r.View(); + std::cout << GridLogMessage << "Copy same" << std::endl; + auto me = AcceleratorView(ViewWrite); + auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ coalescedWrite(me[ss],him(ss)); }); diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 8acbde66..3147823d 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -103,8 +103,8 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & GridBase *grid = left.Grid(); // Might make all code paths go this way. 
- auto left_v = left.View(); - auto right_v=right.View(); + auto left_v = left.AcceleratorView(ViewRead); + auto right_v=right.AcceleratorView(ViewRead); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); @@ -175,9 +175,9 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt GridBase *grid = x.Grid(); - auto x_v=x.View(); - auto y_v=y.View(); - auto z_v=z.View(); + auto x_v=x.AcceleratorView(ViewRead); + auto y_v=y.AcceleratorView(ViewRead); + auto z_v=z.AcceleratorView(ViewWrite); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); @@ -224,8 +224,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti GridBase *grid = left.Grid(); - auto left_v=left.View(); - auto right_v=right.View(); + auto left_v=left.AcceleratorView(ViewRead); + auto right_v=right.AcceleratorView(ViewRead); const uint64_t nsimd = grid->Nsimd(); const uint64_t sites = grid->oSites(); From 6b64727161b4328d54ed919be2979ba518614c37 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 5 May 2020 05:05:36 -0400 Subject: [PATCH 23/51] disable comments --- Grid/lattice/Lattice_base.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 30aa6b06..0b03dea0 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -116,7 +116,7 @@ public: int target; cudaGetDevice(&target); cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target); - std::cout<< GridLogMessage << "To Device " << target << std::endl; + //std::cout<< GridLogMessage << "To Device " << target << std::endl; #endif #endif }; @@ -125,7 +125,7 @@ public: #ifdef GRID_NVCC #ifndef __CUDA_ARCH__ // only on host cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); - std::cout<< GridLogMessage << "To Host" << std::endl; + //std::cout<< GridLogMessage << "To Host" << std::endl; #endif #endif }; @@ -448,7 +448,7 @@ public: typename 
std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - std::cout << GridLogMessage << "Copy other" << std::endl; + //std::cout << GridLogMessage << "Copy other" << std::endl; auto me = AcceleratorView(ViewWrite); auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ @@ -463,7 +463,7 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - std::cout << GridLogMessage << "Copy same" << std::endl; + //std::cout << GridLogMessage << "Copy same" << std::endl; auto me = AcceleratorView(ViewWrite); auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ From e9b295f967c03bfe200880cd834721b2583dced5 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 6 May 2020 08:42:28 -0400 Subject: [PATCH 24/51] Synchronize blocking infrastructure with GPT --- .../iterative/ImplicitlyRestartedLanczos.h | 205 ------------- Grid/lattice/Lattice.h | 2 +- Grid/lattice/Lattice_base.h | 5 - Grid/lattice/Lattice_transfer.h | 286 +++++++++++++----- 4 files changed, 216 insertions(+), 282 deletions(-) diff --git a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h index 8bee43cc..3d0a2a75 100644 --- a/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h +++ b/Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h @@ -37,211 +37,6 @@ Author: Christoph Lehner NAMESPACE_BEGIN(Grid); - //////////////////////////////////////////////////////// - // Move following 100 LOC to lattice/Lattice_basis.h - //////////////////////////////////////////////////////// -template -void basisOrthogonalize(std::vector &basis,Field &w,int k) -{ - // If assume basis[j] are already orthonormal, - // can take all inner products in parallel saving 2x bandwidth - // Save 3x bandwidth on the second line of loop. - // perhaps 2.5x speed up. 
- // 2x overall in Multigrid Lanczos - for(int j=0; j -void basisRotate(std::vector &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) -{ - typedef decltype(basis[0].View()) View; - auto tmp_v = basis[0].View(); - Vector basis_v(basis.size(),tmp_v); - typedef typename Field::vector_object vobj; - GridBase* grid = basis[0].Grid(); - - for(int k=0;k > Bt(thread_max() * Nm); // Thread private - thread_region - { - vobj* B = Bt.data() + Nm * thread_num(); - - thread_for_in_region(ss, grid->oSites(),{ - for(int j=j0; joSites(); - uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead - - // printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock); - - Vector Bt(siteBlock * nrot); - auto Bp=&Bt[0]; - - // GPU readable copy of Eigen matrix - Vector Qt_jv(Nm*Nm); - double *Qt_p = & Qt_jv[0]; - for(int k=0;k -void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) -{ - typedef decltype(basis[0].View()) View; - typedef typename Field::vector_object vobj; - GridBase* grid = basis[0].Grid(); - - result.Checkerboard() = basis[0].Checkerboard(); - auto result_v=result.View(); - Vector basis_v(basis.size(),result_v); - for(int k=0;k Qt_jv(Nm); - double * Qt_j = & Qt_jv[0]; - for(int k=0;koSites(),vobj::Nsimd(),{ - auto B=coalescedRead(zz); - for(int k=k0; k -void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, std::vector& idx) -{ - int vlen = idx.size(); - - assert(vlen>=1); - assert(vlen<=sort_vals.size()); - assert(vlen<=_v.size()); - - for (size_t i=0;ii for which _vnew[j] = _vold[i], - // track the move idx[j] => idx[i] - // track the move idx[i] => i - ////////////////////////////////////// - size_t j; - for (j=i;j i); assert(j!=idx.size()); assert(idx[j]==i); - - swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy - std::swap(sort_vals[i],sort_vals[idx[i]]); - - idx[j] = idx[i]; - idx[i] = i; - } - } -} - -inline std::vector 
basisSortGetIndex(std::vector& sort_vals) -{ - std::vector idx(sort_vals.size()); - std::iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { - return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); - }); - return idx; -} - -template -void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) -{ - std::vector idx = basisSortGetIndex(sort_vals); - if (reverse) - std::reverse(idx.begin(), idx.end()); - - basisReorderInPlace(_v,sort_vals,idx); -} - -// PAB: faster to compute the inner products first then fuse loops. -// If performance critical can improve. -template -void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { - result = Zero(); - assert(_v.size()==eval.size()); - int N = (int)_v.size(); - for (int i=0;i #include #include #include - +#include diff --git a/Grid/lattice/Lattice_base.h b/Grid/lattice/Lattice_base.h index 0b03dea0..74525cc1 100644 --- a/Grid/lattice/Lattice_base.h +++ b/Grid/lattice/Lattice_base.h @@ -116,7 +116,6 @@ public: int target; cudaGetDevice(&target); cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target); - //std::cout<< GridLogMessage << "To Device " << target << std::endl; #endif #endif }; @@ -125,7 +124,6 @@ public: #ifdef GRID_NVCC #ifndef __CUDA_ARCH__ // only on host cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId); - //std::cout<< GridLogMessage << "To Host" << std::endl; #endif #endif }; @@ -425,7 +423,6 @@ public: // copy constructor /////////////////////////////////////////// Lattice(const Lattice& r){ - // std::cout << "Lattice constructor(const Lattice &) "<_grid = r.Grid(); resize(this->_grid->oSites()); *this = r; @@ -448,7 +445,6 @@ public: typename std::enable_if::value,int>::type i=0; conformable(*this,r); this->checkerboard = r.Checkerboard(); - //std::cout << GridLogMessage << "Copy other" << std::endl; auto me = 
AcceleratorView(ViewWrite); auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ @@ -463,7 +459,6 @@ public: inline Lattice & operator = (const Lattice & r){ this->checkerboard = r.Checkerboard(); conformable(*this,r); - //std::cout << GridLogMessage << "Copy same" << std::endl; auto me = AcceleratorView(ViewWrite); auto him= r.AcceleratorView(ViewRead); accelerator_for(ss,me.size(),vobj::Nsimd(),{ diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index c80e7db2..c23ddcdc 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -6,6 +6,7 @@ Copyright (C) 2015 Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -63,6 +64,7 @@ template inline void pickCheckerboard(int cb,Lattice &half,con } }); } + template inline void setCheckerboard(Lattice &full,const Lattice &half){ int cb = half.Checkerboard(); auto half_v = half.View(); @@ -81,25 +83,130 @@ template inline void setCheckerboard(Lattice &full,const Latti } }); } - -template + +//////////////////////////////////////////////////////////////////////////////////////////// +// Flexible Type Conversion for internal promotion to double as well as graceful +// treatment of scalar-compatible types +//////////////////////////////////////////////////////////////////////////////////////////// +accelerator_inline void convertType(ComplexD & out, const std::complex & in) { + out = in; +} + +accelerator_inline void convertType(ComplexF & out, const std::complex & in) { + out = in; +} + +#ifdef __CUDA_ARCH__ +accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { + ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in; +} +accelerator_inline void convertType(vComplexD & out, const ComplexD & in) { + ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in; +} +accelerator_inline void 
convertType(vComplexD2 & out, const ComplexD & in) { + ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in; +} +#endif + +accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) { + out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v); +} + +accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { + Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); +} + +template + accelerator_inline void convertType(iMatrix & out, const iMatrix & in); +template + accelerator_inline void convertType(iVector & out, const iVector & in); + +template::value, T1>::type* = nullptr> +accelerator_inline void convertType(T1 & out, const iScalar & in) { + convertType(out,in._internal); +} + +template +accelerator_inline void convertType(iScalar & out, const T2 & in) { + convertType(out._internal,in); +} + +template +accelerator_inline void convertType(iMatrix & out, const iMatrix & in) { + for (int i=0;i +accelerator_inline void convertType(iVector & out, const iVector & in) { + for (int i=0;i::value, T>::type* = nullptr> +accelerator_inline void convertType(T & out, const T & in) { + out = in; +} + +template +accelerator_inline void convertType(Lattice & out, const Lattice & in) { + auto out_v = out.AcceleratorView(ViewWrite); + auto in_v = in.AcceleratorView(ViewRead); + + accelerator_for(ss,out_v.size(),T1::Nsimd(),{ + convertType(out_v[ss],in_v(ss)); + }); +} + +//////////////////////////////////////////////////////////////////////////////////////////// +// precision-promoted local inner product +//////////////////////////////////////////////////////////////////////////////////////////// +template +inline auto localInnerProductD(const Lattice &lhs,const Lattice &rhs) +-> Lattice> +{ + auto lhs_v = lhs.AcceleratorView(ViewRead); + auto rhs_v = rhs.AcceleratorView(ViewRead); + + typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner; + Lattice> ret(lhs.Grid()); + auto 
ret_v = ret.AcceleratorView(ViewWrite); + + accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ + convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss))); + }); + + return ret; +} + +//////////////////////////////////////////////////////////////////////////////////////////// +// block routines +//////////////////////////////////////////////////////////////////////////////////////////// +template inline void blockProject(Lattice > &coarseData, - const Lattice &fineData, - const std::vector > &Basis) + const Lattice &fineData, + const VLattice &Basis) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); - Lattice ip(coarse); + Lattice> ip(coarse); + Lattice fineDataRed = fineData; // auto fineData_ = fineData.View(); - auto coarseData_ = coarseData.View(); - auto ip_ = ip.View(); + auto coarseData_ = coarseData.AcceleratorView(ViewWrite); + auto ip_ = ip.AcceleratorView(ViewReadWrite); for(int v=0;v accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { - coalescedWrite(coarseData_[sc](v),ip_(sc)); + convertType(coarseData_[sc](v),ip_[sc]); }); + + // improve numerical stability of projection + // |fine> = |fine> - |basis> + ip=-ip; + blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); } } @@ -166,11 +273,11 @@ inline void blockProject1(Lattice > &coarseData, return; } -template -inline void blockZAXPY(Lattice &fineZ, - const Lattice &coarseA, - const Lattice &fineX, - const Lattice &fineY) +template + inline void blockZAXPY(Lattice &fineZ, + const Lattice &coarseA, + const Lattice &fineX, + const Lattice &fineY) { GridBase * fine = fineZ.Grid(); GridBase * coarse= coarseA.Grid(); @@ -182,7 +289,7 @@ inline void blockZAXPY(Lattice &fineZ, conformable(fineX,fineZ); int _ndimension = coarse->_ndimension; - + Coordinate block_r (_ndimension); // FIXME merge with subdivide checking routine as this is redundant @@ -191,29 +298,65 @@ inline void blockZAXPY(Lattice &fineZ, assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); } - auto fineZ_ 
= fineZ.View(); - auto fineX_ = fineX.View(); - auto fineY_ = fineY.View(); - auto coarseA_= coarseA.View(); + auto fineZ_ = fineZ.AcceleratorView(ViewWrite); + auto fineX_ = fineX.AcceleratorView(ViewRead); + auto fineY_ = fineY.AcceleratorView(ViewRead); + auto coarseA_= coarseA.AcceleratorView(ViewRead); accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { - - int sc; - Coordinate coor_c(_ndimension); - Coordinate coor_f(_ndimension); - Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); - for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; - Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); + int sc; + Coordinate coor_c(_ndimension); + Coordinate coor_f(_ndimension); - // z = A x + y - coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); + Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); + for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; + Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); - }); + // z = A x + y +#ifdef __CUDA_ARCH__ + typename vobj2::tensor_reduced::scalar_object cA; + typename vobj::scalar_object cAx; +#else + typename vobj2::tensor_reduced cA; + vobj cAx; +#endif + convertType(cA,TensorRemove(coarseA_(sc))); + auto prod = cA*fineX_(sf); + convertType(cAx,prod); + coalescedWrite(fineZ_[sf],cAx+fineY_(sf)); + + }); return; } + template + inline void blockInnerProductD(Lattice &CoarseInner, + const Lattice &fineX, + const Lattice &fineY) +{ + typedef iScalar dotp; + + GridBase *coarse(CoarseInner.Grid()); + GridBase *fine (fineX.Grid()); + + Lattice fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); + Lattice coarse_inner(coarse); + + auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); + auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); + + // Precision promotion + fine_inner = localInnerProductD(fineX,fineY); + blockSum(coarse_inner,fine_inner); + accelerator_for(ss, coarse->oSites(), 1, { + convertType(CoarseInner_[ss], 
TensorRemove(coarse_inner_[ss])); + }); + +} + +template // deprecate inline void blockInnerProduct(Lattice &CoarseInner, const Lattice &fineX, const Lattice &fineY) @@ -227,8 +370,8 @@ inline void blockInnerProduct(Lattice &CoarseInner, Lattice coarse_inner(coarse); // Precision promotion? - auto CoarseInner_ = CoarseInner.View(); - auto coarse_inner_ = coarse_inner.View(); + auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite); + auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite); fine_inner = localInnerProduct(fineX,fineY); blockSum(coarse_inner,fine_inner); @@ -236,6 +379,7 @@ inline void blockInnerProduct(Lattice &CoarseInner, CoarseInner_[ss] = coarse_inner_[ss]; }); } + template inline void blockNormalise(Lattice &ip,Lattice &fineX) { @@ -248,7 +392,7 @@ inline void blockNormalise(Lattice &ip,Lattice &fineX) // useful in multigrid project; // Generic name : Coarsen? template -inline void blockSum(Lattice &coarseData,const Lattice &fineData) +inline void blockSum(Lattice &coarseData,const Lattice &fineData) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); @@ -256,42 +400,41 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) subdivides(coarse,fine); // require they map int _ndimension = coarse->_ndimension; - + Coordinate block_r (_ndimension); - + for(int d=0 ; d<_ndimension;d++){ block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; } int blockVol = fine->oSites()/coarse->oSites(); - // Turn this around to loop threaded over sc and interior loop - // over sf would thread better - auto coarseData_ = coarseData.View(); - auto fineData_ = fineData.View(); + auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite); + auto fineData_ = fineData.AcceleratorView(ViewRead); accelerator_for(sc,coarse->oSites(),1,{ - // One thread per sub block - Coordinate coor_c(_ndimension); - Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate - coarseData_[sc]=Zero(); + // One 
thread per sub block + Coordinate coor_c(_ndimension); + Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate + coarseData_[sc]=Zero(); - for(int sb=0;sb_rdimensions); + for(int sb=0;sb_rdimensions); - }); + coarseData_[sc]=coarseData_[sc]+fineData_[sf]; + } + + }); return; } + template inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice &picked,Coordinate coor) { @@ -313,8 +456,8 @@ inline void blockPick(GridBase *coarse,const Lattice &unpicked,Lattice -inline void blockOrthogonalise(Lattice &ip,std::vector > &Basis) +template +inline void blockOrthonormalize(Lattice &ip,VLattice &Basis) { GridBase *coarse = ip.Grid(); GridBase *fine = Basis[0].Grid(); @@ -322,23 +465,30 @@ inline void blockOrthogonalise(Lattice &ip,std::vector > int nbasis = Basis.size() ; // checks - subdivides(coarse,fine); + subdivides(coarse,fine); for(int i=0;i (Basis[v],ip,Basis[u],Basis[v]); + blockZAXPY(Basis[v],ip,Basis[u],Basis[v]); } blockNormalise(ip,Basis[v]); } } +template +inline void blockOrthogonalise(Lattice &ip,std::vector > &Basis) // deprecated inaccurate naming +{ + blockOrthonormalize(ip,Basis); +} + #if 0 +// TODO: CPU optimized version here template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, @@ -383,24 +533,18 @@ inline void blockPromote(const Lattice > &coarseData, } #else -template +template inline void blockPromote(const Lattice > &coarseData, Lattice &fineData, - const std::vector > &Basis) + const VLattice &Basis) { GridBase * fine = fineData.Grid(); GridBase * coarse= coarseData.Grid(); - fineData=Zero(); for(int i=0;i > ip = PeekIndex<0>(coarseData,i); - Lattice cip(coarse); - auto cip_ = cip.View(); - auto ip_ = ip.View(); - accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ - coalescedWrite(cip_[sc], ip_(sc)()); - }); - blockZAXPY(fineData,cip,Basis[i],fineData); + auto ip_ = ip.AcceleratorView(ViewRead); + blockZAXPY(fineData,ip,Basis[i],fineData); } } #endif @@ -470,8 +614,8 @@ 
void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate rdt = Tg->_rdimensions; Coordinate ist = Tg->_istride; Coordinate ost = Tg->_ostride; - auto t_v = To.View(); - auto f_v = From.View(); + auto t_v = To.AcceleratorView(ViewWrite); + auto f_v = From.AcceleratorView(ViewRead); accelerator_for(idx,Fg->lSites(),1,{ sobj s; Coordinate Fcoor(nd); From 87984ece7d135f236c671ab3caeaafd7beb4158f Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 6 May 2020 08:47:18 -0400 Subject: [PATCH 25/51] add Lattice_basis.h --- Grid/lattice/Lattice_basis.h | 236 +++++++++++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 Grid/lattice/Lattice_basis.h diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h new file mode 100644 index 00000000..f1126936 --- /dev/null +++ b/Grid/lattice/Lattice_basis.h @@ -0,0 +1,236 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/lattice/Lattice_basis.h + +Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle +Author: Christoph Lehner + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ + /* END LEGAL */ + +#pragma once + +NAMESPACE_BEGIN(Grid); + +template +void basisOrthogonalize(std::vector &basis,Field &w,int k) +{ + // If assume basis[j] are already orthonormal, + // can take all inner products in parallel saving 2x bandwidth + // Save 3x bandwidth on the second line of loop. + // perhaps 2.5x speed up. + // 2x overall in Multigrid Lanczos + for(int j=0; j +void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) +{ + typedef decltype(basis[0]) Field; + typedef decltype(basis[0].View()) View; + auto tmp_v = basis[0].AcceleratorView(ViewReadWrite); + Vector basis_v(basis.size(),tmp_v); + typedef typename std::remove_reference::type vobj; + GridBase* grid = basis[0].Grid(); + + for(int k=0;k B(Nm); // Thread private + thread_for_in_region(ss, grid->oSites(),{ + for(int j=j0; joSites(); + uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead + + Vector Bt(siteBlock * nrot); + auto Bp=&Bt[0]; + + // GPU readable copy of matrix + Vector Qt_jv(Nm*Nm); + double *Qt_p = & Qt_jv[0]; + thread_for(i,Nm*Nm,{ + int j = i/Nm; + int k = i%Nm; + Qt_p[i]=Qt(j,k); + }); + + // Block the loop to keep storage footprint down + for(uint64_t s=0;s +void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) +{ + typedef decltype(basis[0].AcceleratorView()) View; + typedef typename Field::vector_object vobj; + GridBase* grid = basis[0].Grid(); + + result.Checkerboard() = basis[0].Checkerboard(); + auto result_v=result.AcceleratorView(ViewWrite); + Vector basis_v(basis.size(),result_v); + for(int k=0;k Qt_jv(Nm); + double * Qt_j = & Qt_jv[0]; + for(int k=0;koSites(),vobj::Nsimd(),{ + auto B=coalescedRead(zz); + for(int k=k0; k +void basisReorderInPlace(std::vector &_v,std::vector& sort_vals, 
std::vector& idx) +{ + int vlen = idx.size(); + + assert(vlen>=1); + assert(vlen<=sort_vals.size()); + assert(vlen<=_v.size()); + + for (size_t i=0;ii for which _vnew[j] = _vold[i], + // track the move idx[j] => idx[i] + // track the move idx[i] => i + ////////////////////////////////////// + size_t j; + for (j=i;j i); assert(j!=idx.size()); assert(idx[j]==i); + + swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy + std::swap(sort_vals[i],sort_vals[idx[i]]); + + idx[j] = idx[i]; + idx[i] = i; + } + } +} + +inline std::vector basisSortGetIndex(std::vector& sort_vals) +{ + std::vector idx(sort_vals.size()); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) { + return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]); + }); + return idx; +} + +template +void basisSortInPlace(std::vector & _v,std::vector& sort_vals, bool reverse) +{ + std::vector idx = basisSortGetIndex(sort_vals); + if (reverse) + std::reverse(idx.begin(), idx.end()); + + basisReorderInPlace(_v,sort_vals,idx); +} + +// PAB: faster to compute the inner products first then fuse loops. +// If performance critical can improve. +template +void basisDeflate(const std::vector &_v,const std::vector& eval,const Field& src_orig,Field& result) { + result = Zero(); + assert(_v.size()==eval.size()); + int N = (int)_v.size(); + for (int i=0;i Date: Thu, 7 May 2020 10:03:45 +0100 Subject: [PATCH 26/51] deleted test routines. cleaned up fast version. assert Ns=4,Nc=3. 
--- Grid/qcd/utils/BaryonUtils.h | 414 ++++------------------------------- 1 file changed, 40 insertions(+), 374 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index a392f223..241395c3 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -46,44 +46,11 @@ public: typedef typename SpinMatrixField::vector_object sobj; static const int epsilon[6][3] ; - //static const Complex epsilon_sgn[6]; static const double epsilon_sgn[6]; private: template - static void baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contractions, - robj &result); - template - static void baryon_site_macro(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result); - template - static void baryon_site_macro(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contractions, - robj &result); - template - static inline void baryon_site_template(unsigned int mask, const mobj &D1, + static inline void baryon_site(unsigned int mask, const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -93,7 +60,7 @@ public: const int parity, robj &result); template - static inline void baryon_site_template(const mobj &D1, + static inline void baryon_site(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -208,266 +175,12 @@ public: template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -/*template -const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), - Complex(1), - Complex(1), - Complex(-1), - Complex(-1), - 
Complex(-1)}; -*/ template const double BaryonUtils::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0}; -//This is the old version -template -template -void BaryonUtils::baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contraction, - robj &result) -{ - - Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - - auto gD1a = GammaA_left * GammaA_right * D1; - auto gD1b = GammaA_left * g4 * GammaA_right * D1; - auto pD1 = 0.5* (gD1a + (double)parity * gD1b); - auto gD3 = GammaB_right * D3; - - for (int ie_left=0; ie_left < 6 ; ie_left++){ - int a_left = epsilon[ie_left][0]; //a - int b_left = epsilon[ie_left][1]; //b - int c_left = epsilon[ie_left][2]; //c - for (int ie_right=0; ie_right < 6 ; ie_right++){ - int a_right = epsilon[ie_right][0]; //a' - int b_right = epsilon[ie_right][1]; //b' - int c_right = epsilon[ie_right][2]; //c' - //complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; - double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; - //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ - auto D2g = D2 * GammaB_left; - for (int alpha_right=0; alpha_right -template -void BaryonUtils::baryon_site_macro(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result) -{ - - Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - - auto gD1a = GammaA_left * GammaA_right * D1; - auto gD1b = GammaA_left * g4 * GammaA_right * D1; - auto pD1 = 0.5* (gD1a + (double)parity * gD1b); - auto gD3 = GammaB_right * D3; - - auto D2g = D2 * GammaB_left; - auto pD1g = pD1 * GammaB_left; - auto gD3g = gD3 * GammaB_left; - - for (int ie_left=0; ie_left < 6 ; ie_left++){ - int a_left = epsilon[ie_left][0]; //a - 
int b_left = epsilon[ie_left][1]; //b - int c_left = epsilon[ie_left][2]; //c - for (int ie_right=0; ie_right < 6 ; ie_right++){ - int a_right = epsilon[ie_right][0]; //a' - int b_right = epsilon[ie_right][1]; //b' - int c_right = epsilon[ie_right][2]; //c' - double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; - //All parts together - for (int gamma_left=0; gamma_left( D1, D2, D3, GA_l, GB_l, GA_r, GB_r, parity, result );\ -} - -template -template -void BaryonUtils::baryon_site_macro(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const int * wick_contraction, - robj &result) -{ -BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 
, 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 0 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, 
GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 0 , 1 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, 
wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 0 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 
1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 0 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 0 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 0 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 1 , 0 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 0 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); -BARYON_SITE( 1 , 1 , 1 , 1 , 1 , 1 , D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, wick_contraction, result); - -} - - template template -inline void BaryonUtils::baryon_site_template(const mobj &D1, +inline void BaryonUtils::baryon_site(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -556,7 +269,7 @@ inline void BaryonUtils::BaryonSiteHelper::function(const unsign assert(mask <= maxMask); if (mask == maxMask) { - 
baryon_site_template(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); + baryon_site(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); } else { @@ -567,7 +280,7 @@ inline void BaryonUtils::BaryonSiteHelper::function(const unsign // top-level function template template -inline void BaryonUtils::baryon_site_template(const unsigned int mask, const mobj &D1, +inline void BaryonUtils::baryon_site(const unsigned int mask, const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -594,40 +307,30 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const int parity, ComplexField &baryon_corr) { - const std::chrono::system_clock::time_point start{ std::chrono::system_clock::now() }; - std::time_t now = std::chrono::system_clock::to_time_t( start ); - std::cout << "Setup start " << std::ctime( &now ); + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - GridBase *grid = q1_left.Grid(); + GridBase *grid = q1_left.Grid(); - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == 
quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; + int wick_id; + for (int ie=0; ie < 6 ; ie++) + wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie); - auto vbaryon_corr= baryon_corr.View(); - auto v1 = q1_left.View(); - auto v2 = q2_left.View(); - auto v3 = q3_left.View(); + auto vbaryon_corr= baryon_corr.View(); + auto v1 = q1_left.View(); + auto v2 = q2_left.View(); + auto v3 = q3_left.View(); - const std::chrono::system_clock::time_point stop{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( stop ); - const std::chrono::duration duration_seconds = stop - start; - const double seconds{ ( duration_seconds.count() ) }; - std::cout << "Setup stop " << std::ctime( &now ) - << "Total duration " << std::fixed << std::setprecision(5) << seconds << " seconds." << std::endl; - - const std::chrono::system_clock::time_point start2{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( start2 ); - std::cout << "Normal Loop start " << std::ctime( &now ); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ //for(int ss=0; ss < grid->oSites(); ss++){ @@ -636,58 +339,9 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites - const std::chrono::system_clock::time_point stop2{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( stop2 ); - const std::chrono::duration duration_seconds2 = stop2 - start2; - const double 
seconds2{ ( duration_seconds2.count() ) }; - std::cout << "Normal Loop stop " << std::ctime( &now ) - << "Total duration " << std::fixed << std::setprecision(5) << seconds2 << " seconds." << std::endl; - const std::chrono::system_clock::time_point start4{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( start4 ); - std::cout << "Opt-macro Loop start " << std::ctime( &now ); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ - //for(int ss=0; ss < grid->oSites(); ss++){ - - auto D1 = v1[ss]; - auto D2 = v2[ss]; - auto D3 = v3[ss]; - - vobj result=Zero(); - baryon_site_macro(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); - vbaryon_corr[ss] = result; - } );//end loop over lattice sites - const std::chrono::system_clock::time_point stop4{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( stop4 ); - const std::chrono::duration duration_seconds4 = stop4 - start4; - const double seconds4{ ( duration_seconds4.count() ) }; - std::cout << "Opt-macro Loop stop " << std::ctime( &now ) - << "Total duration " << std::fixed << std::setprecision(5) << seconds4 << " seconds." 
<< std::endl; - const std::chrono::system_clock::time_point start3{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( start3 ); - int wick_id=32*wick_contraction[0]+16*wick_contraction[1]+8*wick_contraction[2]+4*wick_contraction[3]+2*wick_contraction[4]+wick_contraction[5]; - std::cout << "Opt-template Loop start " << std::ctime( &now ); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ - //for(int ss=0; ss < grid->oSites(); ss++){ - - auto D1 = v1[ss]; - auto D2 = v2[ss]; - auto D3 = v3[ss]; - - vobj result=Zero(); - baryon_site_template(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); - vbaryon_corr[ss] = result; - } );//end loop over lattice sites - const std::chrono::system_clock::time_point stop3{ std::chrono::system_clock::now() }; - now = std::chrono::system_clock::to_time_t( stop3 ); - const std::chrono::duration duration_seconds3 = stop3 - start3; - const double seconds3{ ( duration_seconds3.count() ) }; - std::cout << "Opt-template Loop stop " << std::ctime( &now ) - << "Total duration " << std::fixed << std::setprecision(5) << seconds3 << " seconds." 
<< std::endl; } template @@ -704,20 +358,24 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, const int parity, robj &result) { - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - int wick_contraction[6]; - for (int ie=0; ie < 6 ; ie++) - wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; - - result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + int wick_id; + for (int ie=0; ie < 6 ; ie++) + wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0) << (5-ie); + + result=Zero(); + baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); } /*********************************************************************** @@ -957,6 +615,10 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, const std::string op, SpinMatrixField &stn_corr) { + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); @@ -994,6 +656,10 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, const std::string op, SpinMatrixField &stn_corr) { + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); From 253bcc3426a212675cc497147c6142c6700102ee Mon Sep 17 00:00:00 2001 From: ferben Date: Thu, 7 May 2020 18:03:17 +0100 Subject: [PATCH 27/51] back to old version --- Grid/qcd/utils/BaryonUtils.h | 249 +++++++++++++++-------------------- 1 file changed, 105 insertions(+), 144 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 241395c3..6cf526c3 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -46,44 +46,20 @@ public: typedef typename SpinMatrixField::vector_object sobj; static const int epsilon[6][3] ; - static const double epsilon_sgn[6]; + static const Complex epsilon_sgn[6]; private: template - static inline void baryon_site(unsigned int mask, const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result); - template - static inline void baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - 
const Gamma GammaB_right, - const int parity, - robj &result); - - template - struct BaryonSiteHelper - { - template - static inline void function(const unsigned int mask, const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result); - }; + static void baryon_site(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + const int * wick_contractions, + robj &result); public: static void ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -176,11 +152,17 @@ public: template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; template -const double BaryonUtils::epsilon_sgn[6] = {1.0,1.0,1.0,-1.0,-1.0,-1.0}; +const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), + Complex(1), + Complex(1), + Complex(-1), + Complex(-1), + Complex(-1)}; +//This is the old version template -template -inline void BaryonUtils::baryon_site(const mobj &D1, +template +void BaryonUtils::baryon_site(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -188,14 +170,9 @@ inline void BaryonUtils::baryon_site(const mobj &D1, const Gamma GammaA_right, const Gamma GammaB_right, const int parity, + const int * wick_contraction, robj &result) { - constexpr bool wick_contraction_0 = ((mask & (1 << 5)) >> 5); - constexpr bool wick_contraction_1 = ((mask & (1 << 4)) >> 4); - constexpr bool wick_contraction_2 = ((mask & (1 << 3)) >> 3); - constexpr bool wick_contraction_3 = ((mask & (1 << 2)) >> 2); - constexpr bool wick_contraction_4 = ((mask & (1 << 1)) >> 1); - constexpr bool wick_contraction_5 = ((mask & (1 << 0)) >> 0); Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) @@ -216,84 +193,77 @@ inline void 
BaryonUtils::baryon_site(const mobj &D1, int a_right = epsilon[ie_right][0]; //a' int b_right = epsilon[ie_right][1]; //b' int c_right = epsilon[ie_right][2]; //c' - double ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; - //All parts together + Complex ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + //This is the \delta_{456}^{123} part + if (wick_contraction[0]){ for (int gamma_left=0; gamma_left -template -template -inline void BaryonUtils::BaryonSiteHelper::function(const unsigned int mask, const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result) -{ - assert(mask <= maxMask); - if (mask == maxMask) - { - baryon_site(D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); - } - else - { - BaryonSiteHelper<(maxMask>0) ? maxMask-1 : 0>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); - } -} - -// top-level function -template -template -inline void BaryonUtils::baryon_site(const unsigned int mask, const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - robj &result) -{ - BaryonSiteHelper<63>::function(mask, D1, D2, D3, GammaA_left, GammaB_left, GammaA_right, GammaB_right, parity, result); -} - - template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, @@ -307,30 +277,30 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const int parity, ComplexField &baryon_corr) { - - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << 
">" << std::endl; + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - GridBase *grid = q1_left.Grid(); + GridBase *grid = q1_left.Grid(); - int wick_id; - for (int ie=0; ie < 6 ; ie++) - wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie); + int wick_contraction[6]; + for (int ie=0; ie < 6 ; ie++) + wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; - auto vbaryon_corr= baryon_corr.View(); - auto v1 = q1_left.View(); - auto v2 = q2_left.View(); - auto v3 = q3_left.View(); + auto vbaryon_corr= baryon_corr.View(); + auto v1 = q1_left.View(); + auto v2 = q2_left.View(); + auto v3 = q3_left.View(); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{ //for(int ss=0; ss < grid->oSites(); ss++){ @@ -339,10 +309,9 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites - } template template @@ -359,23 +328,23 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, robj &result) { - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - int wick_id; - for (int ie=0; ie < 6 ; 
ie++) - wick_id = ((quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0) << (5-ie); - - result=Zero(); - baryon_site(wick_id,D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,result); + int wick_contraction[6]; + for (int ie=0; ie < 6 ; ie++) + wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0; + + result=Zero(); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); } /*********************************************************************** @@ -615,10 +584,6 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, const std::string op, SpinMatrixField &stn_corr) { - - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); @@ -656,10 +621,6 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, const std::string op, SpinMatrixField &stn_corr) { - - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); From 42bb5f0721de5cfdf0d16c6eda66a9fd9f4d13c6 Mon Sep 17 00:00:00 2001 From: ferben Date: Thu, 7 May 2020 18:06:12 +0100 Subject: [PATCH 28/51] asserrtion --- Grid/qcd/utils/BaryonUtils.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 6cf526c3..fa2f3376 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -584,6 +584,10 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, const std::string op, SpinMatrixField &stn_corr) { + + assert(Ns==4 && 
"Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); @@ -621,6 +625,10 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, const std::string op, SpinMatrixField &stn_corr) { + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = qs_ti.Grid(); auto vcorr= stn_corr.View(); From 21ca182c368ba2415d874c744b8def244c0b37dd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 8 May 2020 09:18:24 -0400 Subject: [PATCH 29/51] Comments remove --- Grid/algorithms/LinearOperator.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index 50600d2d..a7fa1a90 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -257,13 +257,11 @@ public: virtual RealD Mpc (const Field &in, Field &out) { Field tmp(in.Grid()); tmp.Checkerboard() = !in.Checkerboard(); - //std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl; _Mat.Meooe(in,tmp); _Mat.MooeeInv(tmp,out); _Mat.Meooe(out,tmp); - //std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl; _Mat.Mooee(in,out); return axpy_norm(out,-1.0,tmp,out); } From 6859a3e1d4cf2487b6d2f7f560f845df4fd5a7af Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 8 May 2020 09:19:12 -0400 Subject: [PATCH 30/51] Schur operator --- benchmarks/Benchmark_schur.cc | 176 ++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 benchmarks/Benchmark_schur.cc diff --git a/benchmarks/Benchmark_schur.cc b/benchmarks/Benchmark_schur.cc new file mode 100644 index 00000000..afee31b0 --- /dev/null +++ b/benchmarks/Benchmark_schur.cc @@ -0,0 +1,176 
@@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_dwf.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + +void benchDw(std::vector & L, int Ls); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + const int Ls=12; + std::vector< std::vector > latts; +#if 0 + latts.push_back(std::vector ({24,24,24,24}) ); + latts.push_back(std::vector ({48,24,24,24}) ); + latts.push_back(std::vector ({96,24,24,24}) ); + latts.push_back(std::vector ({96,48,24,24}) ); + // latts.push_back(std::vector ({96,48,48,24}) ); + // latts.push_back(std::vector ({96,48,48,48}) ); +#else + // latts.push_back(std::vector ({96,48,48,48}) ); + latts.push_back(std::vector ({96,96,96,192}) ); +#endif + + std::cout << GridLogMessage<< 
"*****************************************************************" < latt4 = latts[l]; + std::cout << GridLogMessage <<"\t"; + for(int d=0;d & latt4, int Ls) +{ + ///////////////////////////////////////////////////////////////////////////////////// + // for Nc=3 + ///////////////////////////////////////////////////////////////////////////////////// + // Dw : Ls*24*(7+48)= Ls*1320 + // + // M5D: Ls*(4*2*Nc mul + 4*2*Nc madd ) = 3*4*2*Nc*Ls = Ls*72 + // Meo: Ls*24*(7+48) + Ls*72 = Ls*1392 + // + // Mee: 3*Ns*2*Nc*Ls // Chroma 6*N5*Nc*Ns + // + // LeemInv : 2*2*Nc*madd*Ls + // LeeInv : 2*2*Nc*madd*Ls + // DeeInv : 4*2*Nc*mul *Ls + // UeeInv : 2*2*Nc*madd*Ls + // UeemInv : 2*2*Nc*madd*Ls = Nc*Ls*(8+8+8+8+8) = 40*Nc*Ls// Chroma (10*N5 - 8)*Nc*Ns ~ (40 N5 - 32)Nc flops + // QUDA counts as dense LsxLs real matrix x Ls x NcNsNreim => Nc*4*2 x Ls^2 FMA = 16Nc Ls^2 flops + // Mpc => 1452*cbvol*2*Ls flops // + // => (1344+Ls*48)*Ls*cbvol*2 flops QUDA = 1920 @Ls=12 and 2112 @Ls=16 + ///////////////////////////////////////////////////////////////////////////////////// + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + // long unsigned int single_site_flops = 8*Nc*(7+16*Nc)*Ls; + long unsigned int single_site_mpc_flops = 8*Nc*(7+16*Nc)*2*Ls + 40*Nc*2*Ls + 4*Nc*2*Ls; + long unsigned int single_site_quda_flops = 8*Nc*(7+16*Nc)*2*Ls + 16*Nc*Ls*Ls + 4*Nc*2*Ls; + std::vector seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + + ColourMatrixF cm = ComplexF(1.0,0.0); + + int ncall=300; + RealD mass=0.1; + RealD M5 =1.8; + RealD NP = UGrid->_Nprocessors; + double volume=1; for(int mu=0;mu Mpc(Dw); + Chebyshev Cheby(0.0,60.0,order); + + { + Mpc.Mpc(src_o,r_o); + 
Mpc.Mpc(src_o,r_o); + Mpc.Mpc(src_o,r_o); + + double t0=usecond(); + for(int i=0;i Date: Fri, 8 May 2020 09:19:54 -0400 Subject: [PATCH 31/51] Remove verbose --- Grid/qcd/action/fermion/MobiusFermion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/MobiusFermion.h b/Grid/qcd/action/fermion/MobiusFermion.h index 1cbb6609..1e948092 100644 --- a/Grid/qcd/action/fermion/MobiusFermion.h +++ b/Grid/qcd/action/fermion/MobiusFermion.h @@ -59,7 +59,7 @@ public: { RealD eps = 1.0; - std::cout<Ls);// eps is ignored for higham assert(zdata->n==this->Ls); From 1d65e2f62ccb3555e6fec9413960bd7b8f46b84d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 8 May 2020 09:20:54 -0400 Subject: [PATCH 32/51] Slightly faster Chebyshev; ifdef'ed out the fastest until tested numerics Lifted from HDCR setup --- Grid/algorithms/approx/Chebyshev.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index 133db2b4..c0b0646d 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -234,10 +234,9 @@ public: GridBase *grid=in.Grid(); - // std::cout << "Chevyshef(): in.Grid()="<({45,12,81,9})); - for(int lat=8;lat<=lmax;lat+=4){ + for(int lat=8;lat<=lmax;lat+=8){ Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + GridCartesian Grid(latt_size,simd_layout,mpi_layout); // NP= Grid.RankCount(); @@ -270,191 +265,8 @@ public: } }; -#if 0 - static double DWF5(int Ls,int L) - { - // RealD mass=0.1; - RealD M5 =1.8; - double mflops; - double mflops_best = 0; - double mflops_worst= 0; - std::vector mflops_all; - - /////////////////////////////////////////////////////// - // Set/Get the layout & grid size - /////////////////////////////////////////////////////// - int threads =
GridThread::GetThreads(); - Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); - Coordinate local({L,L,L,L}); - - GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({64,64,64,64}), - GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); - uint64_t NP = TmpGrid->RankCount(); - uint64_t NN = TmpGrid->NodeCount(); - NN_global=NN; - uint64_t SHM=NP/NN; - - Coordinate internal; - if ( SHM == 1 ) internal = Coordinate({1,1,1,1}); - else if ( SHM == 2 ) internal = Coordinate({2,1,1,1}); - else if ( SHM == 4 ) internal = Coordinate({2,2,1,1}); - else if ( SHM == 8 ) internal = Coordinate({2,2,2,1}); - else assert(0); - - Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); - Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); - - ///////// Welcome message //////////// - std::cout< seeds4({1,2,3,4}); - std::vector seeds5({5,6,7,8}); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5); - std::cout << GridLogMessage << "Initialised RNGs" << std::endl; - - ///////// Source preparation //////////// - LatticeFermion src (sFGrid); - LatticeFermion tmp (sFGrid); - std::cout << GridLogMessage << "allocated src and tmp" << std::endl; - random(RNG5,src); - std::cout << GridLogMessage << "intialised random source" << std::endl; - - RealD N2 = 1.0/::sqrt(norm2(src)); - src = src*N2; - - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); - - WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5); - LatticeFermion src_e (sFrbGrid); - LatticeFermion src_o (sFrbGrid); - LatticeFermion r_e (sFrbGrid); - LatticeFermion r_o (sFrbGrid); - LatticeFermion r_eo (sFGrid); - LatticeFermion err (sFGrid); - { - - pickCheckerboard(Even,src_e,src); - pickCheckerboard(Odd,src_o,src); - -#if defined(AVX512) - const int num_cases = 6; - std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O "); -#else - 
const int num_cases = 4; - std::string fmt("U/S ; U/O ; G/S ; G/O "); -#endif - controls Cases [] = { -#ifdef AVX512 - { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, - { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, -#endif - { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, - { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }, - { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, - { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } - }; - - for(int c=0;cBarrier(); - for(int i=0;iBarrier(); - double t1=usecond(); - - sDw.ZeroCounters(); - time_statistics timestat; - std::vector t_time(ncall); - for(uint64_t i=0;iBarrier(); - - double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; - if ( mflopsRankCount(); uint64_t NN = TmpGrid->NodeCount(); NN_global=NN; uint64_t SHM=NP/NN; - Coordinate internal; - if ( SHM == 1 ) internal = Coordinate({1,1,1,1}); - else if ( SHM == 2 ) internal = Coordinate({2,1,1,1}); - else if ( SHM == 4 ) internal = Coordinate({2,2,1,1}); - else if ( SHM == 8 ) internal = Coordinate({2,2,2,1}); - else assert(0); - - Coordinate nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]}); - Coordinate latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout< U(4,FGrid); - auto Umu_v = Umu.View(); - auto Umu5d_v = Umu5d.View(); - for(int ss=0;ssoSites();ss++){ - for(int 
s=0;s(Umu5d,mu); - } - for(int mu=0;muBarrier(); for(int i=0;iBarrier(); double t1=usecond(); - // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0); - // if (ncall < 500) ncall = 500; - uint64_t ncall = 1000; + uint64_t ncall = 50; FGrid->Broadcast(0,&ncall,sizeof(ncall)); @@ -649,24 +406,11 @@ public: std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + + typedef ImprovedStaggeredFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU3::HotConfiguration(RNG4,Umu); + + typename Action::ImplParams params; + Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + const int num_cases = 4; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential }, + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall 
<< " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops L_list({16,24,32}); int selm1=sel-1; - std::vector robust_list; std::vector wilson; std::vector dwf4; - std::vector dwf5; + std::vector staggered; - if ( do_wilson ) { - int Ls=1; - std::cout< Date: Sat, 9 May 2020 22:27:56 -0400 Subject: [PATCH 39/51] Split allocator cache into two pools of different sizes --- Grid/allocator/AlignedAllocator.cc | 69 +++++++++++++++--------------- Grid/allocator/AlignedAllocator.h | 17 ++++---- 2 files changed, 44 insertions(+), 42 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index d53c4dc2..77646410 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,21 +6,19 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; -#ifdef GRID_NVCC -#define SMALL_LIMIT (0) -#else -#define SMALL_LIMIT (4096) -#endif - -#ifdef POINTER_CACHE -int PointerCache::victim; - +int PointerCache::Victim; +int PointerCache::VictimSmall; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; +PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmall]; -void *PointerCache::Insert(void *ptr,size_t bytes) { - - if (bytes < SMALL_LIMIT ) return ptr; - +void *PointerCache::Insert(void *ptr,size_t bytes) +{ + if (bytes < GRID_ALLOC_SMALL_LIMIT ) + return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall); + return Insert(ptr,bytes,Entries,Ncache,Victim); +} +void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) +{ #ifdef GRID_OMP assert(omp_in_parallel()==0); #endif @@ -28,8 +26,8 @@ void *PointerCache::Insert(void *ptr,size_t bytes) { void * ret = NULL; int v = -1; - for(int e=0;e #define POINTER_CACHE #define GRID_ALLOC_ALIGN (2*1024*1024) +#define 
GRID_ALLOC_SMALL_LIMIT (4096) NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? -#ifdef POINTER_CACHE + class PointerCache { private: /*Pinning pages is costly*/ /*Could maintain separate large and small allocation caches*/ -#ifdef GRID_NVCC - static const int Ncache=128; -#else +/* Could make these configurable, perhaps up to a max size*/ + static const int NcacheSmall=128; static const int Ncache=8; -#endif - static int victim; typedef struct { void *address; @@ -65,14 +63,17 @@ private: } PointerCacheEntry; static PointerCacheEntry Entries[Ncache]; + static int Victim; + static PointerCacheEntry EntriesSmall[NcacheSmall]; + static int VictimSmall; public: static void *Insert(void *ptr,size_t bytes) ; + static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ; static void *Lookup(size_t bytes) ; - + static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ; }; -#endif std::string sizeString(size_t bytes); From 2bb2c68e15572bcc6012bfe2694bdca10948463f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 9 May 2020 22:57:21 -0400 Subject: [PATCH 40/51] Separate pools for small and large allocations cache --- Grid/allocator/AlignedAllocator.cc | 16 ++++++++++++++-- Grid/allocator/AlignedAllocator.h | 12 +++++++----- .../CayleyFermion5DImplementation.h | 3 ++- Grid/util/Init.cc | 2 ++ 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index 77646410..976dfbdc 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -6,11 +6,23 @@ NAMESPACE_BEGIN(Grid); MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; +int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax; +int PointerCache::Ncache = PointerCache::NcacheMax; int PointerCache::Victim; int PointerCache::VictimSmall; -PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; 
-PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmall]; +PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax]; +PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax]; +void PointerCache::Init(void) +{ + char * str; + str= getenv("GRID_ALLOC_NCACHE_LARGE"); + if ( str ) Ncache = atoi(str); + if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax; + str= getenv("GRID_ALLOC_NCACHE_SMALL"); + if ( str ) NcacheSmall = atoi(str); + if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax; +} void *PointerCache::Insert(void *ptr,size_t bytes) { if (bytes < GRID_ALLOC_SMALL_LIMIT ) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index d6e2e073..77167299 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -53,8 +53,10 @@ private: /*Pinning pages is costly*/ /*Could maintain separate large and small allocation caches*/ /* Could make these configurable, perhaps up to a max size*/ - static const int NcacheSmall=128; - static const int Ncache=8; + static const int NcacheSmallMax=128; + static const int NcacheMax=16; + static int NcacheSmall; + static int Ncache; typedef struct { void *address; @@ -62,13 +64,13 @@ private: int valid; } PointerCacheEntry; - static PointerCacheEntry Entries[Ncache]; + static PointerCacheEntry Entries[NcacheMax]; static int Victim; - static PointerCacheEntry EntriesSmall[NcacheSmall]; + static PointerCacheEntry EntriesSmall[NcacheSmallMax]; static int VictimSmall; public: - + static void Init(void); static void *Insert(void *ptr,size_t bytes) ; static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ; static void *Lookup(size_t bytes) ; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index c80d2425..e379026c 
100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -779,9 +779,9 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, assert(mu>=0); assert(mu::SeqConservedCurrent(PropagatorField &q_in, #endif #ifndef GRID_NVCC + int tshift = (mu == Nd-1) ? 1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE //////////////////////////////////////////////// diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 570f4234..1b672141 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -355,6 +355,8 @@ void Grid_init(int *argc,char ***argv) ////////////////////////////////////////////////////////// GridGpuInit(); // Must come first to set device prior to MPI init + PointerCache::Init(); + if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){ int MB; arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm"); From ea08f193e7bdd7fcb8d18a8713f0f5387def9b2f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 10 May 2020 05:24:26 -0400 Subject: [PATCH 41/51] Allocator cache split into large/small pools --- Grid/algorithms/approx/Chebyshev.h | 2 +- Grid/allocator/AlignedAllocator.cc | 10 +++++++++- Grid/communicator/SharedMemory.cc | 4 +++- benchmarks/Benchmark_schur.cc | 4 ++-- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h index c0b0646d..584ed1d5 100644 --- a/Grid/algorithms/approx/Chebyshev.h +++ b/Grid/algorithms/approx/Chebyshev.h @@ -236,7 +236,6 @@ public: int vol=grid->gSites(); typedef typename Field::vector_type vector_type; - constexpr int Nsimd = vector_type::Nsimd(); Field T0(grid); T0 = in; Field T1(grid); @@ -264,6 +263,7 @@ public: auto Tn_v = Tn->View(); auto Tnp_v = Tnp->View(); auto Tnm_v = Tnm->View(); + constexpr int Nsimd = vector_type::Nsimd(); accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc index 976dfbdc..ef6459ed 100644 --- a/Grid/allocator/AlignedAllocator.cc +++ b/Grid/allocator/AlignedAllocator.cc @@ -7,7 +7,11 @@ MemoryStats *MemoryProfiler::stats = nullptr; bool MemoryProfiler::debug = false; int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax; -int PointerCache::Ncache = PointerCache::NcacheMax; +#ifdef GRID_CUDA +int PointerCache::Ncache = 32; +#else +int PointerCache::Ncache = 8; +#endif int PointerCache::Victim; int PointerCache::VictimSmall; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax]; @@ -16,12 +20,16 @@ PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheS void PointerCache::Init(void) { char * str; + str= getenv("GRID_ALLOC_NCACHE_LARGE"); if ( str ) Ncache = atoi(str); if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax; + str= getenv("GRID_ALLOC_NCACHE_SMALL"); if ( str ) NcacheSmall = atoi(str); if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax; + + // printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax); } void *PointerCache::Insert(void *ptr,size_t bytes) { diff --git a/Grid/communicator/SharedMemory.cc b/Grid/communicator/SharedMemory.cc index 5bca9764..de10da3d 100644 --- a/Grid/communicator/SharedMemory.cc +++ b/Grid/communicator/SharedMemory.cc @@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){ if (heap_bytes >= heap_size) { std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm flag" < > latts; -#if 0 +#if 1 latts.push_back(std::vector ({24,24,24,24}) ); latts.push_back(std::vector ({48,24,24,24}) ); latts.push_back(std::vector ({96,24,24,24}) ); @@ -157,7 +157,7 @@ void benchDw(std::vector & latt4, int Ls) std::cout <<"\t"< 
Date: Tue, 12 May 2020 17:55:53 -0400 Subject: [PATCH 42/51] Remove the norm in MdagM --- Grid/algorithms/CoarsenedMatrix.h | 31 +- Grid/algorithms/LinearOperator.h | 628 +++++++++--------- Grid/algorithms/SparseMatrix.h | 14 +- Grid/qcd/action/fermion/CayleyFermion5D.h | 4 +- .../fermion/ContinuedFractionFermion5D.h | 4 +- .../action/fermion/DomainWallEOFAFermion.h | 4 +- Grid/qcd/action/fermion/FermionOperator.h | 9 +- .../action/fermion/ImprovedStaggeredFermion.h | 4 +- .../fermion/ImprovedStaggeredFermion5D.h | 5 +- Grid/qcd/action/fermion/MobiusEOFAFermion.h | 4 +- .../action/fermion/PartialFractionFermion5D.h | 4 +- Grid/qcd/action/fermion/WilsonCloverFermion.h | 5 +- Grid/qcd/action/fermion/WilsonFermion.h | 4 +- Grid/qcd/action/fermion/WilsonFermion5D.h | 5 +- Grid/qcd/action/fermion/WilsonTMFermion5D.h | 8 +- .../CayleyFermion5DImplementation.h | 6 +- ...ContinuedFractionFermion5DImplementation.h | 7 +- .../DomainWallEOFAFermionImplementation.h | 6 +- ...ImprovedStaggeredFermion5DImplementation.h | 29 +- .../ImprovedStaggeredFermionImplementation.h | 44 +- .../MobiusEOFAFermionImplementation.h | 6 +- .../PartialFractionFermion5DImplementation.h | 6 +- .../WilsonCloverFermionImplementation.h | 6 +- .../WilsonFermionImplementation.h | 52 +- Grid/qcd/modules/Registration.h | 8 +- Grid/qcd/smearing/GaugeConfiguration.h | 2 +- Grid/qcd/utils/CovariantCshift.h | 40 ++ 27 files changed, 469 insertions(+), 476 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index a6b01986..4c26f799 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -541,17 +541,14 @@ public: /////////////////////// GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know - RealD M (const CoarseVector &in, CoarseVector &out){ - + void M (const CoarseVector &in, CoarseVector &out) + { conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); - // RealD Nin = norm2(in); 
SimpleCompressor compressor; - double comms_usec = -usecond(); Stencil.HaloExchange(in,compressor); - comms_usec += usecond(); auto in_v = in.View(); auto out_v = out.View(); @@ -565,12 +562,7 @@ public: typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0](0))) calcComplex; - GridStopWatch ArithmeticTimer; int osites=Grid()->oSites(); - // double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint; - // double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex); - double usecs =-usecond(); - // assert(geom.npoint==9); accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { int ss = sss/nbasis; @@ -598,23 +590,9 @@ public: } coalescedWrite(out_v[ss](b),res,lane); }); - usecs +=usecond(); - - double nrm_usec=-usecond(); - RealD Nout= norm2(out); - nrm_usec+=usecond(); - - /* - std::cout << GridLogMessage << "\tNorm " << nrm_usec << " us" <oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); - // if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });} - // accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); }); } } diff --git a/Grid/algorithms/LinearOperator.h b/Grid/algorithms/LinearOperator.h index c41f8eef..1add212c 100644 --- a/Grid/algorithms/LinearOperator.h +++ b/Grid/algorithms/LinearOperator.h @@ -43,7 +43,6 @@ NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////////////////// template class LinearOperatorBase { public: - // Support for coarsening to a multigrid virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base @@ -94,7 +93,10 @@ public: _Mat.Mdag(in,out); } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ - _Mat.MdagM(in,out,n1,n2); + _Mat.MdagM(in,out); + ComplexD dot = innerProduct(in,out); + 
n1=real(dot); + n2=norm2(out); } void HermOp(const Field &in, Field &out){ _Mat.MdagM(in,out); @@ -131,17 +133,14 @@ public: assert(0); } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ - _Mat.MdagM(in,out,n1,n2); - out = out + _shift*in; - - ComplexD dot; - dot= innerProduct(in,out); + HermOp(in,out); + ComplexD dot = innerProduct(in,out); n1=real(dot); n2=norm2(out); } void HermOp(const Field &in, Field &out){ - RealD n1,n2; - HermOpAndNorm(in,out,n1,n2); + _Mat.MdagM(in,out); + out = out + _shift*in; } }; @@ -170,7 +169,7 @@ public: _Mat.M(in,out); } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ - _Mat.M(in,out); + HermOp(in,out); ComplexD dot= innerProduct(in,out); n1=real(dot); n2=norm2(out); } @@ -208,339 +207,305 @@ public: } }; - ////////////////////////////////////////////////////////// - // Even Odd Schur decomp operators; there are several - // ways to introduce the even odd checkerboarding - ////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////// +// Even Odd Schur decomp operators; there are several +// ways to introduce the even odd checkerboarding +////////////////////////////////////////////////////////// - template - class SchurOperatorBase : public LinearOperatorBase { - public: - virtual RealD Mpc (const Field &in, Field &out) =0; - virtual RealD MpcDag (const Field &in, Field &out) =0; - virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { - Field tmp(in.Grid()); - tmp.Checkerboard() = in.Checkerboard(); - ni=Mpc(in,tmp); - no=MpcDag(tmp,out); - } - virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ - out.Checkerboard() = in.Checkerboard(); - MpcDagMpc(in,out,n1,n2); - } - virtual void HermOp(const Field &in, Field &out){ - RealD n1,n2; - HermOpAndNorm(in,out,n1,n2); - } - void Op (const Field &in, Field &out){ - Mpc(in,out); - } - void AdjOp (const Field &in, Field &out){ - MpcDag(in,out); - } 
- // Support for coarsening to a multigrid - void OpDiag (const Field &in, Field &out) { - assert(0); // must coarsen the unpreconditioned system - } - void OpDir (const Field &in, Field &out,int dir,int disp) { - assert(0); - } - void OpDirAll (const Field &in, std::vector &out){ - assert(0); - }; - }; - template - class SchurDiagMooeeOperator : public SchurOperatorBase { - public: - Matrix &_Mat; - SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; - virtual RealD Mpc (const Field &in, Field &out) { - Field tmp(in.Grid()); - tmp.Checkerboard() = !in.Checkerboard(); - - _Mat.Meooe(in,tmp); - _Mat.MooeeInv(tmp,out); - _Mat.Meooe(out,tmp); - - _Mat.Mooee(in,out); - return axpy_norm(out,-1.0,tmp,out); - } - virtual RealD MpcDag (const Field &in, Field &out){ - Field tmp(in.Grid()); - - _Mat.MeooeDag(in,tmp); - _Mat.MooeeInvDag(tmp,out); - _Mat.MeooeDag(out,tmp); - - _Mat.MooeeDag(in,out); - return axpy_norm(out,-1.0,tmp,out); - } - }; - template - class SchurDiagOneOperator : public SchurOperatorBase { - protected: - Matrix &_Mat; - public: - SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; - - virtual RealD Mpc (const Field &in, Field &out) { - Field tmp(in.Grid()); - - _Mat.Meooe(in,out); - _Mat.MooeeInv(out,tmp); - _Mat.Meooe(tmp,out); - _Mat.MooeeInv(out,tmp); - - return axpy_norm(out,-1.0,tmp,in); - } - virtual RealD MpcDag (const Field &in, Field &out){ - Field tmp(in.Grid()); - - _Mat.MooeeInvDag(in,out); - _Mat.MeooeDag(out,tmp); - _Mat.MooeeInvDag(tmp,out); - _Mat.MeooeDag(out,tmp); - - return axpy_norm(out,-1.0,tmp,in); - } - }; - template - class SchurDiagTwoOperator : public SchurOperatorBase { - protected: - Matrix &_Mat; - public: - SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; - - virtual RealD Mpc (const Field &in, Field &out) { - Field tmp(in.Grid()); - - _Mat.MooeeInv(in,out); - _Mat.Meooe(out,tmp); - _Mat.MooeeInv(tmp,out); - _Mat.Meooe(out,tmp); - - return axpy_norm(out,-1.0,tmp,in); - } - virtual RealD MpcDag (const Field &in, Field &out){ - 
Field tmp(in.Grid()); - - _Mat.MeooeDag(in,out); - _Mat.MooeeInvDag(out,tmp); - _Mat.MeooeDag(tmp,out); - _Mat.MooeeInvDag(out,tmp); - - return axpy_norm(out,-1.0,tmp,in); - } - }; - - template - class NonHermitianSchurOperatorBase : public LinearOperatorBase - { - public: - virtual RealD Mpc (const Field& in, Field& out) = 0; - virtual RealD MpcDag (const Field& in, Field& out) = 0; - virtual void MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) { - Field tmp(in.Grid()); - tmp.Checkerboard() = in.Checkerboard(); - ni = Mpc(in,tmp); - no = MpcDag(tmp,out); - } - virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) { - assert(0); - } - virtual void HermOp(const Field& in, Field& out) { - assert(0); - } - void Op(const Field& in, Field& out) { - Mpc(in, out); - } - void AdjOp(const Field& in, Field& out) { - MpcDag(in, out); - } - // Support for coarsening to a multigrid - void OpDiag(const Field& in, Field& out) { - assert(0); // must coarsen the unpreconditioned system - } - void OpDir(const Field& in, Field& out, int dir, int disp) { - assert(0); - } - void OpDirAll(const Field& in, std::vector& out){ - assert(0); - }; - }; - - template - class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase - { - public: - Matrix& _Mat; - NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){}; - virtual RealD Mpc(const Field& in, Field& out) { - Field tmp(in.Grid()); - tmp.Checkerboard() = !in.Checkerboard(); - - _Mat.Meooe(in, tmp); - _Mat.MooeeInv(tmp, out); - _Mat.Meooe(out, tmp); - - _Mat.Mooee(in, out); - - return axpy_norm(out, -1.0, tmp, out); - } - virtual RealD MpcDag(const Field& in, Field& out) { - Field tmp(in.Grid()); - - _Mat.MeooeDag(in, tmp); - _Mat.MooeeInvDag(tmp, out); - _Mat.MeooeDag(out, tmp); - - _Mat.MooeeDag(in, out); - - return axpy_norm(out, -1.0, tmp, out); - } - }; - - template - class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase - { - protected: - Matrix 
&_Mat; - - public: - NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){}; - virtual RealD Mpc(const Field& in, Field& out) { - Field tmp(in.Grid()); - - _Mat.Meooe(in, out); - _Mat.MooeeInv(out, tmp); - _Mat.Meooe(tmp, out); - _Mat.MooeeInv(out, tmp); - - return axpy_norm(out, -1.0, tmp, in); - } - virtual RealD MpcDag(const Field& in, Field& out) { - Field tmp(in.Grid()); - - _Mat.MooeeInvDag(in, out); - _Mat.MeooeDag(out, tmp); - _Mat.MooeeInvDag(tmp, out); - _Mat.MeooeDag(out, tmp); - - return axpy_norm(out, -1.0, tmp, in); - } - }; - - template - class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase - { - protected: - Matrix& _Mat; - - public: - NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){}; - - virtual RealD Mpc(const Field& in, Field& out) { - Field tmp(in.Grid()); - - _Mat.MooeeInv(in, out); - _Mat.Meooe(out, tmp); - _Mat.MooeeInv(tmp, out); - _Mat.Meooe(out, tmp); - - return axpy_norm(out, -1.0, tmp, in); - } - virtual RealD MpcDag(const Field& in, Field& out) { - Field tmp(in.Grid()); - - _Mat.MeooeDag(in, out); - _Mat.MooeeInvDag(out, tmp); - _Mat.MeooeDag(tmp, out); - _Mat.MooeeInvDag(out, tmp); - - return axpy_norm(out, -1.0, tmp, in); - } - }; - - /////////////////////////////////////////////////////////////////////////////////////////////////// - // Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta - // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi - /////////////////////////////////////////////////////////////////////////////////////////////////// - template using SchurDiagOneRH = SchurDiagTwoOperator ; - template using SchurDiagOneLH = SchurDiagOneOperator ; - /////////////////////////////////////////////////////////////////////////////////////////////////// - // Staggered use - 
/////////////////////////////////////////////////////////////////////////////////////////////////// - template - class SchurStaggeredOperator : public SchurOperatorBase { - protected: - Matrix &_Mat; - Field tmp; - RealD mass; - double tMpc; - double tIP; - double tMeo; - double taxpby_norm; - uint64_t ncall; -public: - void Report(void) - { - std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "< +class SchurOperatorBase : public LinearOperatorBase { + public: + virtual void Mpc (const Field &in, Field &out) =0; + virtual void MpcDag (const Field &in, Field &out) =0; + virtual void MpcDagMpc(const Field &in, Field &out) { + Field tmp(in.Grid()); + tmp.Checkerboard() = in.Checkerboard(); + Mpc(in,tmp); + MpcDag(tmp,out); + } virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ - ncall++; - tMpc-=usecond(); - n2 = Mpc(in,out); - tMpc+=usecond(); - tIP-=usecond(); - ComplexD dot= innerProduct(in,out); - tIP+=usecond(); - n1 = real(dot); + out.Checkerboard() = in.Checkerboard(); + MpcDagMpc(in,out); + ComplexD dot= innerProduct(in,out); + n1=real(dot); + n2=norm2(out); } virtual void HermOp(const Field &in, Field &out){ - ncall++; - tMpc-=usecond(); - _Mat.Meooe(in,out); - _Mat.Meooe(out,tmp); - tMpc+=usecond(); - taxpby_norm-=usecond(); - axpby(out,-1.0,mass*mass,tmp,in); - taxpby_norm+=usecond(); + out.Checkerboard() = in.Checkerboard(); + MpcDagMpc(in,out); } - virtual RealD Mpc (const Field &in, Field &out) - { + void Op (const Field &in, Field &out){ + Mpc(in,out); + } + void AdjOp (const Field &in, Field &out){ + MpcDag(in,out); + } + // Support for coarsening to a multigrid + void OpDiag (const Field &in, Field &out) { + assert(0); // must coarsen the unpreconditioned system + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; +}; +template + class SchurDiagMooeeOperator : public SchurOperatorBase { + public: + 
Matrix &_Mat; + SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; + virtual void Mpc (const Field &in, Field &out) { + Field tmp(in.Grid()); + tmp.Checkerboard() = !in.Checkerboard(); + + _Mat.Meooe(in,tmp); + _Mat.MooeeInv(tmp,out); + _Mat.Meooe(out,tmp); + _Mat.Mooee(in,out); + axpy(out,-1.0,tmp,out); + } + virtual void MpcDag (const Field &in, Field &out){ + Field tmp(in.Grid()); + + _Mat.MeooeDag(in,tmp); + _Mat.MooeeInvDag(tmp,out); + _Mat.MeooeDag(out,tmp); + _Mat.MooeeDag(in,out); + axpy(out,-1.0,tmp,out); + } +}; +template + class SchurDiagOneOperator : public SchurOperatorBase { + protected: + Matrix &_Mat; + public: + SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; + + virtual void Mpc (const Field &in, Field &out) { + Field tmp(in.Grid()); + _Mat.Meooe(in,out); + _Mat.MooeeInv(out,tmp); + _Mat.Meooe(tmp,out); + _Mat.MooeeInv(out,tmp); + axpy(out,-1.0,tmp,in); + } + virtual void MpcDag (const Field &in, Field &out){ + Field tmp(in.Grid()); + + _Mat.MooeeInvDag(in,out); + _Mat.MeooeDag(out,tmp); + _Mat.MooeeInvDag(tmp,out); + _Mat.MeooeDag(out,tmp); + axpy(out,-1.0,tmp,in); + } +}; +template + class SchurDiagTwoOperator : public SchurOperatorBase { + protected: + Matrix &_Mat; + public: + SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; + + virtual void Mpc (const Field &in, Field &out) { + Field tmp(in.Grid()); + + _Mat.MooeeInv(in,out); + _Mat.Meooe(out,tmp); + _Mat.MooeeInv(tmp,out); + _Mat.Meooe(out,tmp); + + axpy(out,-1.0,tmp,in); + } + virtual void MpcDag (const Field &in, Field &out){ + Field tmp(in.Grid()); + + _Mat.MeooeDag(in,out); + _Mat.MooeeInvDag(out,tmp); + _Mat.MeooeDag(tmp,out); + _Mat.MooeeInvDag(out,tmp); + + axpy(out,-1.0,tmp,in); + } +}; + +template +class NonHermitianSchurOperatorBase : public LinearOperatorBase +{ + public: + virtual void Mpc (const Field& in, Field& out) = 0; + virtual void MpcDag (const Field& in, Field& out) = 0; + virtual void MpcDagMpc(const Field& in, Field& out) { + Field tmp(in.Grid()); + tmp.Checkerboard() 
= in.Checkerboard(); + Mpc(in,tmp); + MpcDag(tmp,out); + } + virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) { + assert(0); + } + virtual void HermOp(const Field& in, Field& out) { + assert(0); + } + void Op(const Field& in, Field& out) { + Mpc(in, out); + } + void AdjOp(const Field& in, Field& out) { + MpcDag(in, out); + } + // Support for coarsening to a multigrid + void OpDiag(const Field& in, Field& out) { + assert(0); // must coarsen the unpreconditioned system + } + void OpDir(const Field& in, Field& out, int dir, int disp) { + assert(0); + } + void OpDirAll(const Field& in, std::vector& out){ + assert(0); + }; +}; + +template +class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase +{ + public: + Matrix& _Mat; + NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){}; + virtual void Mpc(const Field& in, Field& out) { + Field tmp(in.Grid()); + tmp.Checkerboard() = !in.Checkerboard(); + + _Mat.Meooe(in, tmp); + _Mat.MooeeInv(tmp, out); + _Mat.Meooe(out, tmp); + + _Mat.Mooee(in, out); + + axpy(out, -1.0, tmp, out); + } + virtual void MpcDag(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.MeooeDag(in, tmp); + _Mat.MooeeInvDag(tmp, out); + _Mat.MeooeDag(out, tmp); + + _Mat.MooeeDag(in, out); + + axpy(out, -1.0, tmp, out); + } +}; + +template +class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase +{ + protected: + Matrix &_Mat; + + public: + NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){}; + virtual void Mpc(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.Meooe(in, out); + _Mat.MooeeInv(out, tmp); + _Mat.Meooe(tmp, out); + _Mat.MooeeInv(out, tmp); + + axpy(out, -1.0, tmp, in); + } + virtual void MpcDag(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.MooeeInvDag(in, out); + _Mat.MeooeDag(out, tmp); + _Mat.MooeeInvDag(tmp, out); + _Mat.MeooeDag(out, tmp); + + axpy(out, -1.0, tmp, in); + } +}; + +template +class 
NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase +{ + protected: + Matrix& _Mat; + + public: + NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){}; + + virtual void Mpc(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.MooeeInv(in, out); + _Mat.Meooe(out, tmp); + _Mat.MooeeInv(tmp, out); + _Mat.Meooe(out, tmp); + + axpy(out, -1.0, tmp, in); + } + virtual void MpcDag(const Field& in, Field& out) { + Field tmp(in.Grid()); + + _Mat.MeooeDag(in, out); + _Mat.MooeeInvDag(out, tmp); + _Mat.MeooeDag(tmp, out); + _Mat.MooeeInvDag(out, tmp); + + axpy(out, -1.0, tmp, in); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta +// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi +/////////////////////////////////////////////////////////////////////////////////////////////////// +template using SchurDiagOneRH = SchurDiagTwoOperator ; +template using SchurDiagOneLH = SchurDiagOneOperator ; +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Staggered use +/////////////////////////////////////////////////////////////////////////////////////////////////// +template +class SchurStaggeredOperator : public SchurOperatorBase { + protected: + Matrix &_Mat; + Field tmp; + RealD mass; + public: + SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) + { + assert( _Mat.isTrivialEE() ); + mass = _Mat.Mass(); + } + virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + Mpc(in,out); + ComplexD dot= innerProduct(in,out); + n1 = real(dot); + n2 =0.0; + } + virtual void HermOp(const Field &in, Field &out){ + Mpc(in,out); + // _Mat.Meooe(in,out); + // _Mat.Meooe(out,tmp); + // axpby(out,-1.0,mass*mass,tmp,in); + } + 
virtual void Mpc (const Field &in, Field &out) + { Field tmp(in.Grid()); Field tmp2(in.Grid()); + + // _Mat.Mooee(in,out); + // _Mat.Mooee(out,tmp); - // std::cout << GridLogIterative << " HermOp.Mpc "< using SchurStagOperator = SchurStaggeredOperator; - ///////////////////////////////////////////////////////////// // Base classes for functions of operators ///////////////////////////////////////////////////////////// diff --git a/Grid/algorithms/SparseMatrix.h b/Grid/algorithms/SparseMatrix.h index b959f53c..8a265b3f 100644 --- a/Grid/algorithms/SparseMatrix.h +++ b/Grid/algorithms/SparseMatrix.h @@ -38,16 +38,12 @@ template class SparseMatrixBase { public: virtual GridBase *Grid(void) =0; // Full checkerboar operations - virtual RealD M (const Field &in, Field &out)=0; - virtual RealD Mdag (const Field &in, Field &out)=0; - virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) { - Field tmp (in.Grid()); - ni=M(in,tmp); - no=Mdag(tmp,out); - } + virtual void M (const Field &in, Field &out)=0; + virtual void Mdag (const Field &in, Field &out)=0; virtual void MdagM(const Field &in, Field &out) { - RealD ni, no; - MdagM(in,out,ni,no); + Field tmp (in.Grid()); + M(in,tmp); + Mdag(tmp,out); } virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index f27f4c23..c7d68d73 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -40,8 +40,8 @@ public: public: // override multiply - virtual RealD M (const FermionField &in, FermionField &out); - virtual RealD Mdag (const FermionField &in, FermionField &out); + virtual void M (const FermionField &in, FermionField &out); + virtual void Mdag (const FermionField &in, FermionField &out); // half checkerboard operations virtual void Meooe (const FermionField &in, FermionField &out); diff --git 
a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 5aa7bfbd..2300afd3 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -41,8 +41,8 @@ public: public: // override multiply - virtual RealD M (const FermionField &in, FermionField &out); - virtual RealD Mdag (const FermionField &in, FermionField &out); + virtual void M (const FermionField &in, FermionField &out); + virtual void Mdag (const FermionField &in, FermionField &out); // half checkerboard operaions virtual void Meooe (const FermionField &in, FermionField &out); diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h index a2d0e733..bcc97176 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h @@ -53,8 +53,8 @@ public: virtual void DtildeInv (const FermionField& in, FermionField& out); // override multiply - virtual RealD M (const FermionField& in, FermionField& out); - virtual RealD Mdag (const FermionField& in, FermionField& out); + virtual void M (const FermionField& in, FermionField& out); + virtual void Mdag (const FermionField& in, FermionField& out); // half checkerboard operations virtual void Mooee (const FermionField& in, FermionField& out); diff --git a/Grid/qcd/action/fermion/FermionOperator.h b/Grid/qcd/action/fermion/FermionOperator.h index f0c2a039..570e350d 100644 --- a/Grid/qcd/action/fermion/FermionOperator.h +++ b/Grid/qcd/action/fermion/FermionOperator.h @@ -58,8 +58,8 @@ public: virtual GridBase *GaugeRedBlackGrid(void) =0; // override multiply - virtual RealD M (const FermionField &in, FermionField &out)=0; - virtual RealD Mdag (const FermionField &in, FermionField &out)=0; + virtual void M (const FermionField &in, FermionField &out)=0; + virtual void Mdag (const FermionField &in, FermionField &out)=0; // half checkerboard operaions 
virtual void Meooe (const FermionField &in, FermionField &out)=0; @@ -86,15 +86,14 @@ public: virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0; virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0; - virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MdirAll(const FermionField &in, std::vector &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac - virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { assert(0);}; + virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector twist) { assert(0);}; - virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector boundary,std::vector twist) + virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector boundary,std::vector twist) { FFT theFFT((GridCartesian *) in.Grid()); diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index 0cfae7b6..ecf44ed7 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -71,8 +71,8 @@ public: // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent ////////////////////////////////////////////////////////////////// - RealD M(const FermionField &in, FermionField &out); - RealD Mdag(const FermionField &in, FermionField &out); + void M(const FermionField &in, FermionField &out); + void Mdag(const FermionField &in, FermionField &out); ///////////////////////////////////////////////////////// // half checkerboard operations diff 
--git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 0ce1c701..d1bb0e9c 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -74,8 +73,8 @@ public: GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} // full checkerboard operations; leave unimplemented as abstract for now - RealD M (const FermionField &in, FermionField &out); - RealD Mdag (const FermionField &in, FermionField &out); + void M (const FermionField &in, FermionField &out); + void Mdag (const FermionField &in, FermionField &out); // half checkerboard operations void Meooe (const FermionField &in, FermionField &out); diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermion.h b/Grid/qcd/action/fermion/MobiusEOFAFermion.h index 6b214233..6e4f79eb 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h +++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h @@ -56,8 +56,8 @@ public: virtual void DtildeInv (const FermionField& in, FermionField& out); // override multiply - virtual RealD M (const FermionField& in, FermionField& out); - virtual RealD Mdag (const FermionField& in, FermionField& out); + virtual void M (const FermionField& in, FermionField& out); + virtual void Mdag (const FermionField& in, FermionField& out); // half checkerboard operations virtual void Mooee (const FermionField& in, FermionField& out); diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index 928abd3f..54f8547f 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -47,8 +47,8 @@ public: void M_internal(const FermionField &in, FermionField &out,int dag); // override multiply - virtual RealD M 
(const FermionField &in, FermionField &out); - virtual RealD Mdag (const FermionField &in, FermionField &out); + virtual void M (const FermionField &in, FermionField &out); + virtual void Mdag (const FermionField &in, FermionField &out); // half checkerboard operaions virtual void Meooe (const FermionField &in, FermionField &out); diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 3847b0d9..4b25d00e 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -109,9 +109,8 @@ public: ImportGauge(_Umu); } - virtual RealD M(const FermionField &in, FermionField &out); - virtual RealD Mdag(const FermionField &in, FermionField &out); - + virtual void M(const FermionField &in, FermionField &out); + virtual void Mdag(const FermionField &in, FermionField &out); virtual void Mooee(const FermionField &in, FermionField &out); virtual void MooeeDag(const FermionField &in, FermionField &out); virtual void MooeeInv(const FermionField &in, FermionField &out); diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index 2e0bc9bf..1c4dd3cf 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -78,8 +78,8 @@ public: // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent ////////////////////////////////////////////////////////////////// - virtual RealD M(const FermionField &in, FermionField &out); - virtual RealD Mdag(const FermionField &in, FermionField &out); + virtual void M(const FermionField &in, FermionField &out); + virtual void Mdag(const FermionField &in, FermionField &out); ///////////////////////////////////////////////////////// // half checkerboard operations diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index ea71376c..804b1d10 100644 --- 
a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -99,8 +98,8 @@ public: GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} // full checkerboard operations; leave unimplemented as abstract for now - virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; - virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; + virtual void M (const FermionField &in, FermionField &out){assert(0);}; + virtual void Mdag (const FermionField &in, FermionField &out){assert(0);}; // half checkerboard operations; leave unimplemented as abstract for now virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; diff --git a/Grid/qcd/action/fermion/WilsonTMFermion5D.h b/Grid/qcd/action/fermion/WilsonTMFermion5D.h index 71acf763..982e722a 100644 --- a/Grid/qcd/action/fermion/WilsonTMFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonTMFermion5D.h @@ -120,7 +120,8 @@ class WilsonTMFermion5D : public WilsonFermion5D } } - virtual RealD M(const FermionField &in, FermionField &out) { + virtual void M(const FermionField &in, FermionField &out) + { out.Checkerboard() = in.Checkerboard(); this->Dhop(in, out, DaggerNo); FermionField tmp(out.Grid()); @@ -129,11 +130,12 @@ class WilsonTMFermion5D : public WilsonFermion5D ComplexD b(0.0,this->mu[s]); axpbg5y_ssp(tmp,a,in,b,in,s,s); } - return axpy_norm(out, 1.0, tmp, out); + axpy(out, 1.0, tmp, out); } // needed for fast PV - void update(const std::vector& _mass, const std::vector& _mu) { + void update(const std::vector& _mass, const std::vector& _mu) + { assert(_mass.size() == _mu.size()); assert(_mass.size() == this->FermionGrid()->_fdimensions[0]); this->mass = _mass; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h 
b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index e379026c..e9675b36 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -323,7 +323,7 @@ void CayleyFermion5D::MeooeDag5D (const FermionField &psi, FermionField } template -RealD CayleyFermion5D::M (const FermionField &psi, FermionField &chi) +void CayleyFermion5D::M (const FermionField &psi, FermionField &chi) { FermionField Din(psi.Grid()); @@ -335,11 +335,10 @@ RealD CayleyFermion5D::M (const FermionField &psi, FermionField &chi) axpby(chi,1.0,1.0,chi,psi); M5D(psi,chi); - return(norm2(chi)); } template -RealD CayleyFermion5D::Mdag (const FermionField &psi, FermionField &chi) +void CayleyFermion5D::Mdag (const FermionField &psi, FermionField &chi) { // Under adjoint //D1+ D1- P- -> D1+^dag P+ D2-^dag @@ -354,7 +353,6 @@ RealD CayleyFermion5D::Mdag (const FermionField &psi, FermionField &chi) M5Ddag(psi,chi); // ((b D_W + D_w hop terms +1) on s-diag axpby (chi,1.0,1.0,chi,psi); - return norm2(chi); } // half checkerboard operations diff --git a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h index beeb3e00..6687800e 100644 --- a/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ContinuedFractionFermion5DImplementation.h @@ -94,7 +94,7 @@ void ContinuedFractionFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Ap template -RealD ContinuedFractionFermion5D::M (const FermionField &psi, FermionField &chi) +void ContinuedFractionFermion5D::M (const FermionField &psi, FermionField &chi) { int Ls = this->Ls; @@ -116,15 +116,14 @@ RealD ContinuedFractionFermion5D::M (const FermionField &psi, F } sign=-sign; } - return norm2(chi); } template -RealD ContinuedFractionFermion5D::Mdag (const 
FermionField &psi, FermionField &chi) +void ContinuedFractionFermion5D::Mdag (const FermionField &psi, FermionField &chi) { // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag // The rest of matrix is symmetric. // Can ignore "dag" - return M(psi,chi); + M(psi,chi); } template void ContinuedFractionFermion5D::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h index 3684fd6c..64ee4033 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h @@ -89,7 +89,7 @@ void DomainWallEOFAFermion::DtildeInv(const FermionField& psi, FermionFiel /*****************************************************************************************************/ template -RealD DomainWallEOFAFermion::M(const FermionField& psi, FermionField& chi) +void DomainWallEOFAFermion::M(const FermionField& psi, FermionField& chi) { FermionField Din(psi.Grid()); @@ -97,11 +97,10 @@ RealD DomainWallEOFAFermion::M(const FermionField& psi, FermionField& chi) this->DW(Din, chi, DaggerNo); axpby(chi, 1.0, 1.0, chi, psi); this->M5D(psi, chi); - return(norm2(chi)); } template -RealD DomainWallEOFAFermion::Mdag(const FermionField& psi, FermionField& chi) +void DomainWallEOFAFermion::Mdag(const FermionField& psi, FermionField& chi) { FermionField Din(psi.Grid()); @@ -109,7 +108,6 @@ RealD DomainWallEOFAFermion::Mdag(const FermionField& psi, FermionField& c this->MeooeDag5D(Din, chi); this->M5Ddag(psi, chi); axpby(chi, 1.0, 1.0, chi, psi); - return(norm2(chi)); } /******************************************************************** diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h 
b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index 23692d49..44a201c1 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -548,21 +548,24 @@ void ImprovedStaggeredFermion5D::MdirAll(const FermionField &in, std::vect assert(0); } template -RealD ImprovedStaggeredFermion5D::M(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion5D::M(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); - return axpy_norm(out, mass, in, out); + axpy(out, mass, in, out); } template -RealD ImprovedStaggeredFermion5D::Mdag(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion5D::Mdag(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerYes); - return axpy_norm(out, mass, in, out); + axpy(out, mass, in, out); } template -void ImprovedStaggeredFermion5D::Meooe(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion5D::Meooe(const FermionField &in, FermionField &out) +{ if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerNo); } else { @@ -570,7 +573,8 @@ void ImprovedStaggeredFermion5D::Meooe(const FermionField &in, FermionFiel } } template -void ImprovedStaggeredFermion5D::MeooeDag(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion5D::MeooeDag(const FermionField &in, FermionField &out) +{ if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerYes); } else { @@ -579,27 +583,30 @@ void ImprovedStaggeredFermion5D::MeooeDag(const FermionField &in, FermionF } template -void ImprovedStaggeredFermion5D::Mooee(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion5D::Mooee(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); typename FermionField::scalar_type scal(mass); out = 
scal * in; } template -void ImprovedStaggeredFermion5D::MooeeDag(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion5D::MooeeDag(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Mooee(in, out); } template -void ImprovedStaggeredFermion5D::MooeeInv(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion5D::MooeeInv(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); out = (1.0 / (mass)) * in; } template -void ImprovedStaggeredFermion5D::MooeeInvDag(const FermionField &in, - FermionField &out) { +void ImprovedStaggeredFermion5D::MooeeInvDag(const FermionField &in,FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); MooeeInv(in, out); } diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 37675da0..57f4cb89 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -171,21 +171,24 @@ void ImprovedStaggeredFermion::ImportGauge(const GaugeField &_Uthin,const ///////////////////////////// template -RealD ImprovedStaggeredFermion::M(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion::M(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); - return axpy_norm(out, mass, in, out); + axpy(out, mass, in, out); } template -RealD ImprovedStaggeredFermion::Mdag(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion::Mdag(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerYes); - return axpy_norm(out, mass, in, out); + axpy(out, mass, in, out); } template -void ImprovedStaggeredFermion::Meooe(const FermionField &in, FermionField &out) { +void 
ImprovedStaggeredFermion::Meooe(const FermionField &in, FermionField &out) +{ if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerNo); } else { @@ -193,7 +196,8 @@ void ImprovedStaggeredFermion::Meooe(const FermionField &in, FermionField } } template -void ImprovedStaggeredFermion::MeooeDag(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion::MeooeDag(const FermionField &in, FermionField &out) +{ if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerYes); } else { @@ -202,27 +206,30 @@ void ImprovedStaggeredFermion::MeooeDag(const FermionField &in, FermionFie } template -void ImprovedStaggeredFermion::Mooee(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion::Mooee(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); typename FermionField::scalar_type scal(mass); out = scal * in; } template -void ImprovedStaggeredFermion::MooeeDag(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion::MooeeDag(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Mooee(in, out); } template -void ImprovedStaggeredFermion::MooeeInv(const FermionField &in, FermionField &out) { +void ImprovedStaggeredFermion::MooeeInv(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); out = (1.0 / (mass)) * in; } template -void ImprovedStaggeredFermion::MooeeInvDag(const FermionField &in, - FermionField &out) { +void ImprovedStaggeredFermion::MooeeInvDag(const FermionField &in,FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); MooeeInv(in, out); } @@ -234,7 +241,8 @@ void ImprovedStaggeredFermion::MooeeInvDag(const FermionField &in, template void ImprovedStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, GaugeField & mat, - const FermionField &A, const FermionField &B, int dag) { + const FermionField &A, const FermionField &B, int dag) +{ assert((dag == DaggerNo) || (dag == 
DaggerYes)); Compressor compressor; @@ -284,8 +292,8 @@ void ImprovedStaggeredFermion::DerivInternal(StencilImpl &st, DoubledGauge } template -void ImprovedStaggeredFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { - +void ImprovedStaggeredFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +{ conformable(U.Grid(), _grid); conformable(U.Grid(), V.Grid()); conformable(U.Grid(), mat.Grid()); @@ -296,8 +304,8 @@ void ImprovedStaggeredFermion::DhopDeriv(GaugeField &mat, const FermionFie } template -void ImprovedStaggeredFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { - +void ImprovedStaggeredFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +{ conformable(U.Grid(), _cbgrid); conformable(U.Grid(), V.Grid()); conformable(U.Grid(), mat.Grid()); @@ -310,8 +318,8 @@ void ImprovedStaggeredFermion::DhopDerivOE(GaugeField &mat, const FermionF } template -void ImprovedStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { - +void ImprovedStaggeredFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +{ conformable(U.Grid(), _cbgrid); conformable(U.Grid(), V.Grid()); conformable(U.Grid(), mat.Grid()); diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h index 256423e6..9b9db178 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h @@ -166,7 +166,7 @@ void MobiusEOFAFermion::DtildeInv(const FermionField& psi, FermionField& c /*****************************************************************************************************/ template -RealD MobiusEOFAFermion::M(const FermionField& psi, FermionField& chi) 
+void MobiusEOFAFermion::M(const FermionField& psi, FermionField& chi) { FermionField Din(psi.Grid()); @@ -174,11 +174,10 @@ RealD MobiusEOFAFermion::M(const FermionField& psi, FermionField& chi) this->DW(Din, chi, DaggerNo); axpby(chi, 1.0, 1.0, chi, psi); this->M5D(psi, chi); - return(norm2(chi)); } template -RealD MobiusEOFAFermion::Mdag(const FermionField& psi, FermionField& chi) +void MobiusEOFAFermion::Mdag(const FermionField& psi, FermionField& chi) { FermionField Din(psi.Grid()); @@ -186,7 +185,6 @@ RealD MobiusEOFAFermion::Mdag(const FermionField& psi, FermionField& chi) this->MeooeDag5D(Din, chi); this->M5Ddag(psi, chi); axpby(chi, 1.0, 1.0, chi, psi); - return(norm2(chi)); } /******************************************************************** diff --git a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h index edc674cc..0206828b 100644 --- a/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/PartialFractionFermion5DImplementation.h @@ -269,16 +269,14 @@ void PartialFractionFermion5D::M_internal(const FermionField &psi, Fermi } template -RealD PartialFractionFermion5D::M (const FermionField &in, FermionField &out) +void PartialFractionFermion5D::M (const FermionField &in, FermionField &out) { M_internal(in,out,DaggerNo); - return norm2(out); } template -RealD PartialFractionFermion5D::Mdag (const FermionField &in, FermionField &out) +void PartialFractionFermion5D::Mdag (const FermionField &in, FermionField &out) { M_internal(in,out,DaggerYes); - return norm2(out); } template diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index 9d99d9e7..36447153 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ 
b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -35,7 +35,7 @@ NAMESPACE_BEGIN(Grid); // *NOT* EO template -RealD WilsonCloverFermion::M(const FermionField &in, FermionField &out) +void WilsonCloverFermion::M(const FermionField &in, FermionField &out) { FermionField temp(out.Grid()); @@ -47,11 +47,10 @@ RealD WilsonCloverFermion::M(const FermionField &in, FermionField &out) Mooee(in, temp); out += temp; - return norm2(out); } template -RealD WilsonCloverFermion::Mdag(const FermionField &in, FermionField &out) +void WilsonCloverFermion::Mdag(const FermionField &in, FermionField &out) { FermionField temp(out.Grid()); @@ -63,7 +62,6 @@ RealD WilsonCloverFermion::Mdag(const FermionField &in, FermionField &out) MooeeDag(in, temp); out += temp; - return norm2(out); } template diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index be05fcf8..5267e0c1 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -102,21 +102,24 @@ void WilsonFermion::ImportGauge(const GaugeField &_Umu) ///////////////////////////// template -RealD WilsonFermion::M(const FermionField &in, FermionField &out) { +void WilsonFermion::M(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerNo); - return axpy_norm(out, diag_mass, in, out); + axpy(out, diag_mass, in, out); } template -RealD WilsonFermion::Mdag(const FermionField &in, FermionField &out) { +void WilsonFermion::Mdag(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Dhop(in, out, DaggerYes); - return axpy_norm(out, diag_mass, in, out); + axpy(out, diag_mass, in, out); } template -void WilsonFermion::Meooe(const FermionField &in, FermionField &out) { +void WilsonFermion::Meooe(const FermionField &in, FermionField &out) 
+{ if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerNo); } else { @@ -125,7 +128,8 @@ void WilsonFermion::Meooe(const FermionField &in, FermionField &out) { } template -void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) { +void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) +{ if (in.Checkerboard() == Odd) { DhopEO(in, out, DaggerYes); } else { @@ -134,26 +138,30 @@ void WilsonFermion::MeooeDag(const FermionField &in, FermionField &out) { } template -void WilsonFermion::Mooee(const FermionField &in, FermionField &out) { +void WilsonFermion::Mooee(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); typename FermionField::scalar_type scal(diag_mass); out = scal * in; } template -void WilsonFermion::MooeeDag(const FermionField &in, FermionField &out) { +void WilsonFermion::MooeeDag(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); Mooee(in, out); } template -void WilsonFermion::MooeeInv(const FermionField &in, FermionField &out) { +void WilsonFermion::MooeeInv(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); out = (1.0/(diag_mass))*in; } template -void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) { +void WilsonFermion::MooeeInvDag(const FermionField &in, FermionField &out) +{ out.Checkerboard() = in.Checkerboard(); MooeeInv(in,out); } @@ -249,7 +257,8 @@ void WilsonFermion::DerivInternal(StencilImpl &st, DoubledGaugeField &U, } template -void WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { +void WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +{ conformable(U.Grid(), _grid); conformable(U.Grid(), V.Grid()); conformable(U.Grid(), mat.Grid()); @@ -260,7 +269,8 @@ void WilsonFermion::DhopDeriv(GaugeField &mat, const FermionField &U, cons } template -void WilsonFermion::DhopDerivOE(GaugeField &mat, 
const FermionField &U, const FermionField &V, int dag) { +void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +{ conformable(U.Grid(), _cbgrid); conformable(U.Grid(), V.Grid()); //conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido) @@ -274,7 +284,8 @@ void WilsonFermion::DhopDerivOE(GaugeField &mat, const FermionField &U, co } template -void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { +void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) +{ conformable(U.Grid(), _cbgrid); conformable(U.Grid(), V.Grid()); //conformable(U.Grid(), mat.Grid()); @@ -287,7 +298,8 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co } template -void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { +void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) +{ conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -297,7 +309,8 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da } template -void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { +void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) +{ conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -308,7 +321,8 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int } template -void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { +void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) +{ conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -386,7 +400,8 @@ template void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const 
FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); @@ -436,7 +451,8 @@ template void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, - FermionField &out, int dag) { + FermionField &out, int dag) +{ assert((dag == DaggerNo) || (dag == DaggerYes)); Compressor compressor(dag); st.HaloExchange(in, compressor); diff --git a/Grid/qcd/modules/Registration.h b/Grid/qcd/modules/Registration.h index ec28f020..459e1d0b 100644 --- a/Grid/qcd/modules/Registration.h +++ b/Grid/qcd/modules/Registration.h @@ -80,10 +80,10 @@ static Registrar, static Registrar< ConjugateGradientModule, HMC_SolverModuleFactory > __CGWFmodXMLInit("ConjugateGradient"); -static Registrar< BiCGSTABModule, - HMC_SolverModuleFactory > __CGWFmodXMLInit("BiCGSTAB"); -static Registrar< ConjugateResidualModule, - HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); +//static Registrar< BiCGSTABModule, +// HMC_SolverModuleFactory > __CGWFmodXMLInit("BiCGSTAB"); +//static Registrar< ConjugateResidualModule, +// HMC_SolverModuleFactory > __CRWFmodXMLInit("ConjugateResidual"); // add the staggered, scalar versions here diff --git a/Grid/qcd/smearing/GaugeConfiguration.h b/Grid/qcd/smearing/GaugeConfiguration.h index f4d00c72..6f2ff2e7 100644 --- a/Grid/qcd/smearing/GaugeConfiguration.h +++ b/Grid/qcd/smearing/GaugeConfiguration.h @@ -49,7 +49,7 @@ public: private: const unsigned int smearingLevels; - Smear_Stout StoutSmearing; + Smear_Stout &StoutSmearing; std::vector SmearedSet; // Member functions diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index ed96f3bf..6ac69150 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -52,6 +52,26 @@ namespace PeriodicBC { tmp = adj(Link)*field; return Cshift(tmp,mu,-1);// moves towards positive mu } + + template auto + 
CovShiftForward(const Lattice &Link, + int mu, + const LatticeUnaryExpression &expr) + -> Lattice + { + Lattice arg(expr); + return CovShiftForward(Link,mu,arg); + } + template auto + CovShiftBackward(const Lattice &Link, + int mu, + const LatticeUnaryExpression &expr) + -> Lattice + { + Lattice arg(expr); + return CovShiftForward(Link,mu,arg); + } + } @@ -122,6 +142,26 @@ namespace ConjugateBC { return Cshift(tmp,mu,-1);// moves towards positive mu } + template auto + CovShiftForward(const Lattice &Link, + int mu, + const LatticeUnaryExpression &expr) + -> Lattice + { + Lattice arg(expr); + return CovShiftForward(Link,mu,arg); + } + template auto + CovShiftBackward(const Lattice &Link, + int mu, + const LatticeUnaryExpression &expr) + -> Lattice + { + Lattice arg(expr); + return CovShiftForward(Link,mu,arg); + } + + } From cb7ee375628d00ca387bb9eba13ead6cf4cc6c73 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 May 2020 17:56:40 -0400 Subject: [PATCH 43/51] Close expressions in arg to cshift --- Grid/cshift/Cshift.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Grid/cshift/Cshift.h b/Grid/cshift/Cshift.h index 07ec8412..9150579c 100644 --- a/Grid/cshift/Cshift.h +++ b/Grid/cshift/Cshift.h @@ -49,4 +49,29 @@ Author: Peter Boyle #ifdef GRID_COMMS_SHMEM #include // uses same implementation of communicator #endif + +NAMESPACE_BEGIN(Grid); + +template +auto Cshift(const LatticeUnaryExpression &expr,int dim,int shift) + -> Lattice +{ + return Cshift(closure(expr),dim,shift); +} +template +auto Cshift(const LatticeBinaryExpression &expr,int dim,int shift) + -> Lattice +{ + return Cshift(closure(expr),dim,shift); +} +template +auto Cshift(const LatticeTrinaryExpression &expr,int dim,int shift) + -> Lattice +{ + return Cshift(closure(expr),dim,shift); +} +NAMESPACE_END(Grid); + #endif From 0e3c49f687c7fdc8398b9c2859053265ed1da121 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 May 2020 17:57:01 -0400 Subject: [PATCH 44/51] 
TransposeIndex was broken by Christoph --- Grid/lattice/Lattice.h | 2 +- Grid/lattice/Lattice_transpose.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 13c0a885..036633b4 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -31,7 +31,7 @@ Author: Peter Boyle #include #include #include -//#include +#include #include #include #include diff --git a/Grid/lattice/Lattice_transpose.h b/Grid/lattice/Lattice_transpose.h index 9b0b3483..6fe08c10 100644 --- a/Grid/lattice/Lattice_transpose.h +++ b/Grid/lattice/Lattice_transpose.h @@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid); //////////////////////////////////////////////////////////////////////////////////////////////////// // Transpose //////////////////////////////////////////////////////////////////////////////////////////////////// +/* template inline Lattice transpose(const Lattice &lhs){ Lattice ret(lhs.Grid()); @@ -48,7 +49,8 @@ inline Lattice transpose(const Lattice &lhs){ }); return ret; }; - +*/ + //////////////////////////////////////////////////////////////////////////////////////////////////// // Index level dependent transpose //////////////////////////////////////////////////////////////////////////////////////////////////// From 8b5b55b68228f1433108bde7d6dbef68dedc05fa Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 12 May 2020 17:57:24 -0400 Subject: [PATCH 45/51] Make tests all compile with current Grid, mostly MdagM removal of norms fixes but a few minor issues fixed too --- tests/IO/Test_openqcd_io.cc | 6 +++--- tests/core/Test_contfrac_even_odd.cc | 8 ++++---- tests/core/Test_dwf_eofa_even_odd.cc | 8 ++++---- tests/core/Test_dwf_even_odd.cc | 8 ++++---- tests/core/Test_gpwilson_even_odd.cc | 8 ++++---- tests/core/Test_mobius_eofa_even_odd.cc | 8 ++++---- tests/core/Test_mobius_even_odd.cc | 8 ++++---- tests/core/Test_staggered.cc | 8 ++++---- tests/core/Test_staggered5D.cc | 8 ++++---- 
tests/core/Test_wilson_even_odd.cc | 8 ++++---- tests/core/Test_wilson_twisted_mass_even_odd.cc | 8 ++++---- tests/core/Test_zmobius_even_odd.cc | 8 ++++---- tests/hmc/Test_multishift_sqrt.cc | 1 + tests/solver/Test_dwf_fpgcr.cc | 11 ++++++----- tests/solver/Test_staggered_block_cg_prec.cc | 11 ----------- tests/solver/Test_zMADWF_prec.cc | 2 +- 16 files changed, 55 insertions(+), 64 deletions(-) diff --git a/tests/IO/Test_openqcd_io.cc b/tests/IO/Test_openqcd_io.cc index 83b498c2..765509a9 100644 --- a/tests/IO/Test_openqcd_io.cc +++ b/tests/IO/Test_openqcd_io.cc @@ -28,13 +28,12 @@ See the full license in the file "LICENSE" in the top level distribution directo #include -#if defined(GRID_COMMS_NONE) -#error This test requires Grid compiled with MPI -#endif + using namespace Grid; int main(int argc, char** argv) { +#if !defined(GRID_COMMS_NONE) Grid_init(&argc, &argv); auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd()); @@ -81,4 +80,5 @@ int main(int argc, char** argv) { // clang-format on Grid_finalize(); +#endif } diff --git a/tests/core/Test_contfrac_even_odd.cc b/tests/core/Test_contfrac_even_odd.cc index 25affd00..5311f869 100644 --- a/tests/core/Test_contfrac_even_odd.cc +++ b/tests/core/Test_contfrac_even_odd.cc @@ -238,11 +238,11 @@ void TestWhat(What & Ddwf, RealD t1,t2; SchurDiagMooeeOperator HermOpEO(Ddwf); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_dwf_eofa_even_odd.cc b/tests/core/Test_dwf_eofa_even_odd.cc index 1d2f2909..01fff9ea 100644 --- a/tests/core/Test_dwf_eofa_even_odd.cc +++ b/tests/core/Test_dwf_eofa_even_odd.cc @@ -218,11 +218,11 @@ int main (int argc, char ** argv) RealD 
t1,t2; SchurDiagMooeeOperator HermOpEO(Ddwf); - HermOpEO.MpcDagMpc(chi_e, dchi_e, t1, t2); - HermOpEO.MpcDagMpc(chi_o, dchi_o, t1, t2); + HermOpEO.MpcDagMpc(chi_e, dchi_e); + HermOpEO.MpcDagMpc(chi_o, dchi_o); - HermOpEO.MpcDagMpc(phi_e, dphi_e, t1, t2); - HermOpEO.MpcDagMpc(phi_o, dphi_o, t1, t2); + HermOpEO.MpcDagMpc(phi_e, dphi_e); + HermOpEO.MpcDagMpc(phi_o, dphi_o); pDce = innerProduct(phi_e, dchi_e); pDco = innerProduct(phi_o, dchi_o); diff --git a/tests/core/Test_dwf_even_odd.cc b/tests/core/Test_dwf_even_odd.cc index d654e588..6093ee8f 100644 --- a/tests/core/Test_dwf_even_odd.cc +++ b/tests/core/Test_dwf_even_odd.cc @@ -216,11 +216,11 @@ int main (int argc, char ** argv) SchurDiagMooeeOperator HermOpEO(Ddwf); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc index ac4cde99..bf37f4d5 100644 --- a/tests/core/Test_gpwilson_even_odd.cc +++ b/tests/core/Test_gpwilson_even_odd.cc @@ -201,11 +201,11 @@ int main (int argc, char ** argv) RealD t1,t2; SchurDiagMooeeOperator HermOpEO(Dw); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_mobius_eofa_even_odd.cc b/tests/core/Test_mobius_eofa_even_odd.cc index bfd53c72..68091229 100644 --- a/tests/core/Test_mobius_eofa_even_odd.cc +++ 
b/tests/core/Test_mobius_eofa_even_odd.cc @@ -220,11 +220,11 @@ int main (int argc, char ** argv) RealD t1,t2; SchurDiagMooeeOperator HermOpEO(Ddwf); - HermOpEO.MpcDagMpc(chi_e, dchi_e, t1, t2); - HermOpEO.MpcDagMpc(chi_o, dchi_o, t1, t2); + HermOpEO.MpcDagMpc(chi_e, dchi_e); + HermOpEO.MpcDagMpc(chi_o, dchi_o); - HermOpEO.MpcDagMpc(phi_e, dphi_e, t1, t2); - HermOpEO.MpcDagMpc(phi_o, dphi_o, t1, t2); + HermOpEO.MpcDagMpc(phi_e, dphi_e); + HermOpEO.MpcDagMpc(phi_o, dphi_o); pDce = innerProduct(phi_e, dchi_e); pDco = innerProduct(phi_o, dchi_o); diff --git a/tests/core/Test_mobius_even_odd.cc b/tests/core/Test_mobius_even_odd.cc index 0a035dc8..7f808cac 100644 --- a/tests/core/Test_mobius_even_odd.cc +++ b/tests/core/Test_mobius_even_odd.cc @@ -266,11 +266,11 @@ int main (int argc, char ** argv) SchurDiagMooeeOperator HermOpEO(Ddwf); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_staggered.cc b/tests/core/Test_staggered.cc index c85d4090..1f42ff0d 100644 --- a/tests/core/Test_staggered.cc +++ b/tests/core/Test_staggered.cc @@ -270,11 +270,11 @@ int main (int argc, char ** argv) pickCheckerboard(Odd ,phi_o,phi); SchurDiagMooeeOperator HermOpEO(Ds); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_staggered5D.cc b/tests/core/Test_staggered5D.cc index 
7055d183..68b8e537 100644 --- a/tests/core/Test_staggered5D.cc +++ b/tests/core/Test_staggered5D.cc @@ -289,11 +289,11 @@ int main (int argc, char ** argv) pickCheckerboard(Odd ,phi_o,phi); SchurDiagMooeeOperator HermOpEO(Ds); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_wilson_even_odd.cc b/tests/core/Test_wilson_even_odd.cc index 8f325f1c..dc49cf81 100644 --- a/tests/core/Test_wilson_even_odd.cc +++ b/tests/core/Test_wilson_even_odd.cc @@ -207,11 +207,11 @@ int main (int argc, char ** argv) RealD t1,t2; SchurDiagMooeeOperator HermOpEO(Dw); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_wilson_twisted_mass_even_odd.cc b/tests/core/Test_wilson_twisted_mass_even_odd.cc index 58b0b60f..ba80fd0e 100644 --- a/tests/core/Test_wilson_twisted_mass_even_odd.cc +++ b/tests/core/Test_wilson_twisted_mass_even_odd.cc @@ -208,11 +208,11 @@ int main (int argc, char ** argv) RealD t1,t2; SchurDiagMooeeOperator HermOpEO(Dw); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); 
pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/core/Test_zmobius_even_odd.cc b/tests/core/Test_zmobius_even_odd.cc index 1150930b..a52e9bc2 100644 --- a/tests/core/Test_zmobius_even_odd.cc +++ b/tests/core/Test_zmobius_even_odd.cc @@ -280,11 +280,11 @@ int main (int argc, char ** argv) SchurDiagMooeeOperator HermOpEO(Ddwf); - HermOpEO.MpcDagMpc(chi_e,dchi_e,t1,t2); - HermOpEO.MpcDagMpc(chi_o,dchi_o,t1,t2); + HermOpEO.MpcDagMpc(chi_e,dchi_e); + HermOpEO.MpcDagMpc(chi_o,dchi_o); - HermOpEO.MpcDagMpc(phi_e,dphi_e,t1,t2); - HermOpEO.MpcDagMpc(phi_o,dphi_o,t1,t2); + HermOpEO.MpcDagMpc(phi_e,dphi_e); + HermOpEO.MpcDagMpc(phi_o,dphi_o); pDce = innerProduct(phi_e,dchi_e); pDco = innerProduct(phi_o,dchi_o); diff --git a/tests/hmc/Test_multishift_sqrt.cc b/tests/hmc/Test_multishift_sqrt.cc index 834e6fc8..f8477220 100644 --- a/tests/hmc/Test_multishift_sqrt.cc +++ b/tests/hmc/Test_multishift_sqrt.cc @@ -57,6 +57,7 @@ public: // Support for coarsening to a multigrid void OpDiag (const Field &in, Field &out) {}; void OpDir (const Field &in, Field &out,int dir,int disp){}; + void OpDirAll (const Field &in, std::vector &out) {}; // Abstract base void Op (const Field &in, Field &out){ out = scale * in; diff --git a/tests/solver/Test_dwf_fpgcr.cc b/tests/solver/Test_dwf_fpgcr.cc index 226bd933..156f678a 100644 --- a/tests/solver/Test_dwf_fpgcr.cc +++ b/tests/solver/Test_dwf_fpgcr.cc @@ -70,9 +70,6 @@ int main (int argc, char ** argv) SU3::HotConfiguration(RNG4,Umu); - TrivialPrecon simple; - - PrecGeneralisedConjugateResidual PGCR(1.0e-6,10000,simple,4,160); ConjugateResidual CR(1.0e-6,10000); @@ -86,15 +83,19 @@ int main (int argc, char ** argv) std::cout< HermOp(Ddwf); + TrivialPrecon simple; + PrecGeneralisedConjugateResidual PGCR(1.0e-6,10000,HermOp,simple,4,160); + result=Zero(); - PGCR(HermOp,src,result); + PGCR(src,result); std::cout< g5HermOp(Ddwf); + PrecGeneralisedConjugateResidual PGCR5(1.0e-6,10000,g5HermOp,simple,4,160); result=Zero(); - 
PGCR(g5HermOp,src,result); + PGCR5(src,result); std::cout< template @@ -595,8 +606,7 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, auto vd_tf = qd_tf.View(); auto vs_ti = qs_ti.View(); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_loop = vq_loop[ss]; auto Dd_tf = vd_tf[ss]; auto Ds_ti = vs_ti[ss]; @@ -637,8 +647,7 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, auto vd_tf = qd_tf.View(); auto vs_ti = qs_ti.View(); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; auto Dd_tf = vd_tf[ss]; From 1f154fe652d90eb0de6ab3a662dedf4db6143ffb Mon Sep 17 00:00:00 2001 From: ferben Date: Tue, 19 May 2020 13:48:56 +0100 Subject: [PATCH 49/51] some cleanup in BaryonUtils --- Grid/qcd/utils/BaryonUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 9205802b..8c8ed59c 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -305,7 +305,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, //bytes += 3. * (grid->oSites() * 12. * 12. * sizeof(Complex)) * wick_contraction[ie]; // size of the 3 propagatorFields bytes += grid->oSites() * 36. * 4. * 4. 
* sizeof(Complex) * wick_contraction[ie]; //number of operations } - double t=0.; + Real t=0.; t =-usecond(); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { From 6c6812a5cab2f4920d355501427116cc79b090e4 Mon Sep 17 00:00:00 2001 From: ferben Date: Wed, 20 May 2020 12:26:57 +0100 Subject: [PATCH 50/51] GB/s output --- Grid/qcd/utils/BaryonUtils.h | 116 +++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 53 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 8c8ed59c..23267270 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -46,7 +46,7 @@ public: typedef typename SpinMatrixField::vector_object sobj; static const int epsilon[6][3] ; - static const Complex epsilon_sgn[6]; + static const Real epsilon_sgn[6]; private: template @@ -151,13 +151,16 @@ public: template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -template +/*template const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), Complex(1), Complex(1), Complex(-1), Complex(-1), Complex(-1)}; +*/ +template +const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; //This is the old version template @@ -174,13 +177,11 @@ void BaryonUtils::baryon_site(const mobj &D1, robj &result) { - Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - + Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) auto gD1a = GammaA_left * GammaA_right * D1; auto gD1b = GammaA_left * g4 * GammaA_right * D1; - auto pD1 = 0.5* (gD1a + (double)parity * gD1b); + auto pD1 = 0.5* (gD1a + (Real)parity * gD1b); auto gD3 = GammaB_right * D3; - auto D2g = D2 * GammaB_left; auto pD1g = pD1 * GammaB_left; auto gD3g = gD3 * GammaB_left; @@ -193,72 +194,78 @@ void BaryonUtils::baryon_site(const mobj &D1, int a_right = epsilon[ie_right][0]; //a' int b_right = epsilon[ie_right][1]; //b' int c_right = epsilon[ie_right][2]; //c' - Complex ee = 
epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; + Real ee = epsilon_sgn[ie_left] * epsilon_sgn[ie_right]; //This is the \delta_{456}^{123} part if (wick_contraction[0]){ for (int gamma_left=0; gamma_left::ContractBaryons(const PropagatorField &q1_left, assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; + std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; + std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; + std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; + std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); @@ -301,19 +308,22 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto v3 = q3_left.View(); Real bytes =0.; + bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); for (int ie=0; ie < 6 ; ie++){ - //bytes += 3. * (grid->oSites() * 12. * 12. * sizeof(Complex)) * wick_contraction[ie]; // size of the 3 propagatorFields - bytes += grid->oSites() * 36. * 4. * 4. 
* sizeof(Complex) * wick_contraction[ie]; //number of operations + if(ie==0 or ie==3){ + bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contraction[ie]; + } + else{ + bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contraction[ie]; + } } Real t=0.; t =-usecond(); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto D1 = v1[ss]; auto D2 = v2[ss]; auto D3 = v3[ss]; - vobj result=Zero(); baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); vbaryon_corr[ss] = result; @@ -343,10 +353,10 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); std::cout << "Contraction <" << quarks_right[0] << quarks_right[1] << quarks_right[2] << "|" << quarks_left[0] << quarks_left[1] << quarks_left[2] << ">" << std::endl; - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; + std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; + std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; + std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; + std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); @@ -354,8 +364,8 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, for (int ie=0; ie < 6 ; ie++) wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 
1 : 0; - result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); + result=Zero(); + baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contraction,result); } /*********************************************************************** From c5c2dbc0cef01817a4442093dc94e440f7193e09 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 2 Jun 2020 14:21:49 -0400 Subject: [PATCH 51/51] Optional CUDA info --- Grid/allocator/AlignedAllocator.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 77167299..2976aed7 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -92,6 +92,13 @@ public: static bool debug; }; +#ifdef GRID_NVCC +#define profilerCudaMeminfo \ + { size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<totalFreed) \ << std::endl; \ - } + } \ + profilerCudaMeminfo; #define profilerAllocate(bytes) \ if (MemoryProfiler::stats) \