diff --git a/.gitignore b/.gitignore index da7de5e4..5838caf7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ ################ *~ *# +*.sublime-* # Precompiled Headers # ####################### diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index dfaea627..1e51c9d2 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -113,6 +113,36 @@ int main (int argc, char ** argv) std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + zDw.CayleyReport(); \ + std::cout<Barrier(); \ + t0=usecond(); \ + for(int i=0;iBarrier(); \ + Dw.CayleyReport(); \ + std::cout< gamma(Ls,std::complex(1.0,0.0)); + ZMobiusFermionVec5dR zDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c); + std::cout<Barrier(); @@ -173,10 +209,13 @@ int main (int argc, char ** argv) BENCH_DW_MEO(Dhop ,src,result); BENCH_DW_MEO(DhopEO ,src_o,r_e); - BENCH_DW(Meooe ,src_o,r_e); + BENCH_DW_SSC(Meooe ,src_o,r_e); BENCH_DW(Mooee ,src_o,r_o); BENCH_DW(MooeeInv,src_o,r_o); + BENCH_ZDW(Mooee ,src_o,r_o); + BENCH_ZDW(MooeeInv,src_o,r_o); + } Grid_finalize(); diff --git a/configure.ac b/configure.ac index f413cde8..f848bd23 100644 --- a/configure.ac +++ b/configure.ac @@ -99,6 +99,13 @@ case ${ac_MKL} in AC_DEFINE([USE_MKL], [1], [Define to 1 if you use the Intel MKL]);; esac +############### HDF5 +AC_ARG_WITH([hdf5], + [AS_HELP_STRING([--with-hdf5=prefix], + [try this for a non-standard install prefix of the HDF5 library])], + [AM_CXXFLAGS="-I$with_hdf5/include $AM_CXXFLAGS"] + [AM_LDFLAGS="-L$with_hdf5/lib $AM_LDFLAGS"]) + ############### first-touch AC_ARG_ENABLE([numa], [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])], @@ -145,6 +152,12 @@ AC_SEARCH_LIBS([fftw_execute], [fftw3], [AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the `FFTW' library])] [have_fftw=true]) +AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp], + [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])] + [have_hdf5=true] + [LIBS="${LIBS} -lhdf5"], [], [-lhdf5]) +AM_CONDITIONAL(BUILD_HDF5, [ test "${have_hdf5}X" == "trueX" ]) + CXXFLAGS=$CXXFLAGS_CPY LDFLAGS=$LDFLAGS_CPY @@ -410,6 +423,7 @@ RNG choice : ${ac_RNG} GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi` LAPACK : ${ac_LAPACK} FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi` +HDF5 : `if test "x$have_hdf5" = xtrue; then echo yes; else echo no; fi` build DOXYGEN documentation : `if test "$DX_FLAG_doc" = '1'; then echo yes; else echo no; fi` ----- BUILD FLAGS ------------------------------------- CXXFLAGS: diff --git a/extras/Hadrons/Application.cc b/extras/Hadrons/Application.cc index 4bb3b383..62674f30 100644 --- a/extras/Hadrons/Application.cc +++ b/extras/Hadrons/Application.cc @@ -42,7 +42,6 @@ using namespace Hadrons; ******************************************************************************/ // constructors //////////////////////////////////////////////////////////////// Application::Application(void) -: env_(Environment::getInstance()) { LOG(Message) << "Modules available:" << std::endl; auto list = ModuleFactory::getInstance().getBuilderList(); @@ -74,11 +73,17 @@ Application::Application(const std::string parameterFileName) parameterFileName_ = parameterFileName; } +// environment shortcut //////////////////////////////////////////////////////// +Environment & Application::env(void) const +{ + return Environment::getInstance(); +} + // access ////////////////////////////////////////////////////////////////////// void Application::setPar(const Application::GlobalPar &par) { par_ = par; - env_.setSeed(strToVec(par_.seed)); + env().setSeed(strToVec(par_.seed)); } const Application::GlobalPar & Application::getPar(void) @@ -89,7 +94,7 @@ const Application::GlobalPar & Application::getPar(void) // execute ///////////////////////////////////////////////////////////////////// void Application::run(void) { - if (!parameterFileName_.empty() and (env_.getNModule() == 0)) + if (!parameterFileName_.empty() and (env().getNModule() == 0)) { parseParameterFile(parameterFileName_); } @@ -124,7 +129,7 @@ void Application::parseParameterFile(const std::string parameterFileName) do { read(reader, "id", id); - env_.createModule(id.name, id.type, reader); + env().createModule(id.name, id.type, reader); } while (reader.nextElement("module")); pop(reader); pop(reader); @@ -134,7 +139,7 @@ void Application::saveParameterFile(const std::string parameterFileName) { XmlWriter writer(parameterFileName); ObjectId id; - const unsigned int nMod = env_.getNModule(); + const unsigned int nMod = env().getNModule(); LOG(Message) << "Saving application to '" << parameterFileName << "'..." << std::endl; write(writer, "parameters", getPar()); @@ -142,10 +147,10 @@ void Application::saveParameterFile(const std::string parameterFileName) for (unsigned int i = 0; i < nMod; ++i) { push(writer, "module"); - id.name = env_.getModuleName(i); - id.type = env_.getModule(i)->getRegisteredName(); + id.name = env().getModuleName(i); + id.type = env().getModule(i)->getRegisteredName(); write(writer, "id", id); - env_.getModule(i)->saveParameters(writer, "options"); + env().getModule(i)->saveParameters(writer, "options"); pop(writer); } pop(writer); @@ -164,10 +169,10 @@ auto memPeak = [this](const std::vector &program)\ \ msg = HadronsLogMessage.isActive();\ HadronsLogMessage.Active(false);\ - env_.dryRun(true);\ - memPeak = env_.executeProgram(program);\ - env_.dryRun(false);\ - env_.freeAll();\ + env().dryRun(true);\ + memPeak = env().executeProgram(program);\ + env().dryRun(false);\ + env().freeAll();\ HadronsLogMessage.Active(true);\ \ return memPeak;\ @@ -179,7 +184,7 @@ void Application::schedule(void) // build module dependency graph LOG(Message) << "Building module graph..." << std::endl; - auto graph = env_.makeModuleGraph(); + auto graph = env().makeModuleGraph(); auto con = graph.getConnectedComponents(); // constrained topological sort using a genetic algorithm @@ -256,7 +261,7 @@ void Application::saveSchedule(const std::string filename) << std::endl; for (auto address: program_) { - program.push_back(env_.getModuleName(address)); + program.push_back(env().getModuleName(address)); } write(writer, "schedule", program); } @@ -274,7 +279,7 @@ void Application::loadSchedule(const std::string filename) program_.clear(); for (auto &name: program) { - program_.push_back(env_.getModuleAddress(name)); + program_.push_back(env().getModuleAddress(name)); } scheduled_ = true; memPeak_ = memPeak(program_); @@ -291,7 +296,7 @@ void Application::printSchedule(void) for (unsigned int i = 0; i < program_.size(); ++i) { LOG(Message) << std::setw(4) << i + 1 << ": " - << env_.getModuleName(program_[i]) << std::endl; + << env().getModuleName(program_[i]) << std::endl; } } @@ -304,9 +309,9 @@ void Application::configLoop(void) { LOG(Message) << BIG_SEP << " Starting measurement for trajectory " << t << " " << BIG_SEP << std::endl; - env_.setTrajectory(t); - env_.executeProgram(program_); + env().setTrajectory(t); + env().executeProgram(program_); } LOG(Message) << BIG_SEP << " End of measurement " << BIG_SEP << std::endl; - env_.freeAll(); + env().freeAll(); } diff --git a/extras/Hadrons/Application.hpp b/extras/Hadrons/Application.hpp index 4ea262df..fce9b6eb 100644 --- a/extras/Hadrons/Application.hpp +++ b/extras/Hadrons/Application.hpp @@ -98,11 +98,13 @@ public: void printSchedule(void); // loop on configurations void configLoop(void); +private: + // environment shortcut + Environment & env(void) const; private: long unsigned int locVol_; std::string parameterFileName_{""}; GlobalPar par_; - Environment &env_; std::vector program_; Environment::Size memPeak_; bool scheduled_{false}; @@ -115,14 +117,14 @@ private: template void Application::createModule(const std::string name) { - env_.createModule(name); + env().createModule(name); } template void Application::createModule(const std::string name, const typename M::Par &par) { - env_.createModule(name, par); + env().createModule(name, par); } END_HADRONS_NAMESPACE diff --git a/extras/Hadrons/Environment.cc b/extras/Hadrons/Environment.cc index 68c170b8..37f2a3d7 100644 --- a/extras/Hadrons/Environment.cc +++ b/extras/Hadrons/Environment.cc @@ -41,8 +41,9 @@ using namespace Hadrons; // constructor ///////////////////////////////////////////////////////////////// Environment::Environment(void) { + nd_ = GridDefaultLatt().size(); grid4d_.reset(SpaceTimeGrid::makeFourDimGrid( - GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), + GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()), GridDefaultMpi())); gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get())); auto loc = getGrid()->LocalDimensions(); @@ -126,6 +127,11 @@ GridRedBlackCartesian * Environment::getRbGrid(const unsigned int Ls) const } } +unsigned int Environment::getNd(void) const +{ + return nd_; +} + // random number generator ///////////////////////////////////////////////////// void Environment::setSeed(const std::vector &seed) { diff --git a/extras/Hadrons/Environment.hpp b/extras/Hadrons/Environment.hpp index 041bcc0e..2628e5a0 100644 --- a/extras/Hadrons/Environment.hpp +++ b/extras/Hadrons/Environment.hpp @@ -106,6 +106,7 @@ public: void createGrid(const unsigned int Ls); GridCartesian * getGrid(const unsigned int Ls = 1) const; GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const; + unsigned int getNd(void) const; // random number generator void setSeed(const std::vector &seed); GridParallelRNG * get4dRng(void) const; @@ -200,6 +201,7 @@ private: std::map grid5d_; GridRbPt gridRb4d_; std::map gridRb5d_; + unsigned int nd_; // random number generator RngPt rng4d_; // module and related maps diff --git a/extras/Hadrons/GeneticScheduler.hpp b/extras/Hadrons/GeneticScheduler.hpp index c9256d96..d0c52596 100644 --- a/extras/Hadrons/GeneticScheduler.hpp +++ b/extras/Hadrons/GeneticScheduler.hpp @@ -166,7 +166,7 @@ void GeneticScheduler::initPopulation(void) { auto p = graph_.topoSort(gen_); - population_.emplace(func_(p), p); + population_.insert(std::make_pair(func_(p), p)); } } @@ -180,8 +180,8 @@ void GeneticScheduler::doCrossover(void) crossover(c1, c2, p1, p2); PARALLEL_CRITICAL { - population_.emplace(func_(c1), c1); - population_.emplace(func_(c2), c2); + population_.insert(std::make_pair(func_(c1), c1)); + population_.insert(std::make_pair(func_(c2), c2)); } } @@ -200,7 +200,7 @@ void GeneticScheduler::doMutation(void) mutation(m, it->second); PARALLEL_CRITICAL { - population_.emplace(func_(m), m); + population_.insert(std::make_pair(func_(m), m)); } } } diff --git a/extras/Hadrons/Modules/MSource/SeqGamma.hpp b/extras/Hadrons/Modules/MSource/SeqGamma.hpp index 181f9532..611b0108 100644 --- a/extras/Hadrons/Modules/MSource/SeqGamma.hpp +++ b/extras/Hadrons/Modules/MSource/SeqGamma.hpp @@ -147,7 +147,7 @@ void TSeqGamma::execute(void) g = makeGammaProd(par().gamma); p = strToVec(par().mom); ph = zero; - for(unsigned int mu = 0; mu < Nd; mu++) + for(unsigned int mu = 0; mu < env().getNd(); mu++) { LatticeCoordinate(coor, mu); ph = ph + p[mu]*coor; diff --git a/lib/AlignedAllocator.cc b/lib/AlignedAllocator.cc new file mode 100644 index 00000000..9df4ec1c --- /dev/null +++ b/lib/AlignedAllocator.cc @@ -0,0 +1,65 @@ + + + +#include + +namespace Grid { + +int PointerCache::victim; + + PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; + +void *PointerCache::Insert(void *ptr,size_t bytes) { + + if (bytes < 4096 ) return NULL; + +#ifdef _OPENMP + assert(omp_in_parallel()==0); +#endif + void * ret = NULL; + int v = -1; + + for(int e=0;e namespace Grid { + class PointerCache { + private: + + static const int Ncache=8; + static int victim; + + typedef struct { + void *address; + size_t bytes; + int valid; + } PointerCacheEntry; + + static PointerCacheEntry Entries[Ncache]; + + public: + + + static void *Insert(void *ptr,size_t bytes) ; + static void *Lookup(size_t bytes) ; + + }; + //////////////////////////////////////////////////////////////////// // A lattice of something, but assume the something is SIMDized. //////////////////////////////////////////////////////////////////// + template class alignedAllocator { public: @@ -66,27 +89,27 @@ public: pointer allocate(size_type __n, const void* _p= 0) { + size_type bytes = __n*sizeof(_Tp); + + _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); + #ifdef HAVE_MM_MALLOC_H - _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); #else - _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); + if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); #endif - _Tp tmp; -#ifdef GRID_NUMA -#pragma omp parallel for schedule(static) - for(int i=0;i<__n;i++){ - ptr[i]=tmp; - } -#endif return ptr; } - void deallocate(pointer __p, size_type) { + void deallocate(pointer __p, size_type __n) { + size_type bytes = __n * sizeof(_Tp); + pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); + #ifdef HAVE_MM_MALLOC_H - _mm_free((void *)__p); + if ( __freeme ) _mm_free((void *)__freeme); #else - free((void *)__p); + if ( __freeme ) free((void *)__freeme); #endif } void construct(pointer __p, const _Tp& __val) { }; diff --git a/lib/Grid.h b/lib/Grid.h index 0c5983f3..cb55d0c8 100644 --- a/lib/Grid.h +++ b/lib/Grid.h @@ -59,13 +59,13 @@ Author: paboyle /////////////////// // Grid headers /////////////////// -#include #include "Config.h" #include #include #include #include #include +#include #include #include #include diff --git a/lib/Makefile.am b/lib/Makefile.am index a779135f..fac622ca 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -1,4 +1,5 @@ extra_sources= +extra_headers= if BUILD_COMMS_MPI extra_sources+=communicator/Communicator_mpi.cc extra_sources+=communicator/Communicator_base.cc @@ -24,6 +25,12 @@ if BUILD_COMMS_NONE extra_sources+=communicator/Communicator_base.cc endif +if BUILD_HDF5 + extra_sources+=serialisation/Hdf5IO.cc + extra_headers+=serialisation/Hdf5IO.h + extra_headers+=serialisation/Hdf5Type.h +endif + # # Libraries # @@ -32,6 +39,9 @@ include Eigen.inc lib_LIBRARIES = libGrid.a -libGrid_a_SOURCES = $(CCFILES) $(extra_sources) +CCFILES += $(extra_sources) +HFILES += $(extra_headers) + +libGrid_a_SOURCES = $(CCFILES) libGrid_adir = $(pkgincludedir) nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h diff --git a/lib/PerfCount.h b/lib/PerfCount.h index 5ab07c02..749441c5 100644 --- a/lib/PerfCount.h +++ b/lib/PerfCount.h @@ -205,12 +205,13 @@ public: void Stop(void) { count=0; cycles=0; + size_t ign; #ifdef __linux__ if ( fd!= -1) { ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0); - ::read(fd, &count, sizeof(long long)); - ::read(cyclefd, &cycles, sizeof(long long)); + ign=::read(fd, &count, sizeof(long long)); + ign=::read(cyclefd, &cycles, sizeof(long long)); } elapsed = cyclecount() - begin; #else diff --git a/lib/Stencil.h b/lib/Stencil.h index 5c3a5ef9..89533b82 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -113,7 +113,7 @@ Gather_plane_simple_table (std::vector >& table,const Lattice { PARALLEL_FOR_LOOP for(int i=0;i &lowDim,Lattice & higherDim,int slice, int } // the above should guarantee that the operations are local - //PARALLEL_FOR_LOOP + PARALLEL_FOR_LOOP for(int idx=0;idxlSites();idx++){ std::vector lcoor(nl); std::vector hcoor(nh); @@ -428,7 +428,7 @@ void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, in } } // the above should guarantee that the operations are local - //PARALLEL_FOR_LOOP + PARALLEL_FOR_LOOP for(int idx=0;idxlSites();idx++){ std::vector lcoor(nl); std::vector hcoor(nh); diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index b8e98dce..781380e5 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -29,6 +29,7 @@ Author: paboyle *************************************************************************************/ /* END LEGAL */ +#include #include @@ -48,18 +49,18 @@ namespace QCD { FourDimGrid, FourDimRedBlackGrid,_M5,p), mass(_mass) - { } + { + } template void CayleyFermion5D::Dminus(const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - this->DW(psi,tmp,DaggerNo); + this->DW(psi,this->tmp(),DaggerNo); for(int s=0;stmp(),s,s);// chi = (1-c[s] D_W) psi } } @@ -87,8 +88,8 @@ template void CayleyFermion5D::CayleyReport(void) std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; - // Flops = 9*12*Ls*vol/2 - RealD mflops = 9.0*12*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting + // Flops = MADD * Ls *Ls *4dvol * spin/colour/complex + RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; } @@ -110,12 +111,11 @@ template void CayleyFermion5D::DminusDag(const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - this->DW(psi,tmp,DaggerYes); + this->DW(psi,this->tmp(),DaggerYes); for(int s=0;stmp(),s,s);// chi = (1-c[s] D_W) psi } } template @@ -138,6 +138,7 @@ void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &D lower[0] =-mass*lower[0]; M5D(psi,psi,Din,lower,diag,upper); } +// FIXME Redunant with the above routine; check this and eliminate template void CayleyFermion5D::Meo5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; @@ -259,36 +260,33 @@ template void CayleyFermion5D::Meooe (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - FermionField tmp(psi._grid); - Meooe5D(psi,tmp); + Meooe5D(psi,this->tmp()); if ( psi.checkerboard == Odd ) { - this->DhopEO(tmp,chi,DaggerNo); + this->DhopEO(this->tmp(),chi,DaggerNo); } else { - this->DhopOE(tmp,chi,DaggerNo); + this->DhopOE(this->tmp(),chi,DaggerNo); } } template void CayleyFermion5D::MeooeDag (const FermionField &psi, FermionField &chi) { - FermionField tmp(psi._grid); // Apply 4d dslash if ( psi.checkerboard == Odd ) { - this->DhopEO(psi,tmp,DaggerYes); + this->DhopEO(psi,this->tmp(),DaggerYes); } else { - this->DhopOE(psi,tmp,DaggerYes); + this->DhopOE(psi,this->tmp(),DaggerYes); } - MeooeDag5D(tmp,chi); + MeooeDag5D(this->tmp(),chi); } template void CayleyFermion5D::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ - FermionField tmp(psi._grid); - Meo5D(psi,tmp); + Meo5D(psi,this->tmp()); // Apply 4d dslash fragment - this->DhopDir(tmp,chi,dir,disp); + this->DhopDir(this->tmp(),chi,dir,disp); } // force terms; five routines; default to Dhop on diagonal template @@ -459,9 +457,91 @@ void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vectorMooeeInternalCompute(0,inv,MatpInv,MatmInv); + this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); + } +template +void CayleyFermion5D::MooeeInternalCompute(int dag, int inv, + Vector > & Matp, + Vector > & Matm) +{ + int Ls=this->Ls; + + GridBase *grid = this->FermionRedBlackGrid(); + int LLs = grid->_rdimensions[0]; + + if ( LLs == Ls ) return; // Not vectorised in 5th direction + + Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); + Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); + + for(int s=0;s::iscomplex() ) { + sp[l] = PplusMat (l*istride+s1*ostride,s2); + sm[l] = PminusMat(l*istride+s1*ostride,s2); + } else { + // if real + scalar_type tmp; + tmp = PplusMat (l*istride+s1*ostride,s2); + sp[l] = scalar_type(tmp.real(),tmp.real()); + tmp = PminusMat(l*istride+s1*ostride,s2); + sm[l] = scalar_type(tmp.real(),tmp.real()); + } + } + Matp[LLs*s2+s1] = Vp; + Matm[LLs*s2+s1] = Vm; + }} +} + FermOpTemplateInstantiate(CayleyFermion5D); GparityFermOpTemplateInstantiate(CayleyFermion5D); diff --git a/lib/qcd/action/fermion/CayleyFermion5D.h b/lib/qcd/action/fermion/CayleyFermion5D.h index 6fb58234..86255be6 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.h +++ b/lib/qcd/action/fermion/CayleyFermion5D.h @@ -33,6 +33,31 @@ namespace Grid { namespace QCD { + template struct switcheroo { + static inline int iscomplex() { return 0; } + + template + static inline vec mult(vec a, vec b) { + return real_mult(a,b); + } + }; + template<> struct switcheroo { + static inline int iscomplex() { return 1; } + + template + static inline vec mult(vec a, vec b) { + return a*b; + } + }; + template<> struct switcheroo { + static inline int iscomplex() { return 1; } + template + static inline vec mult(vec a, vec b) { + return a*b; + } + }; + + template class CayleyFermion5D : public WilsonFermion5D { @@ -75,7 +100,19 @@ namespace Grid { std::vector &lower, std::vector &diag, std::vector &upper); + void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv); + void MooeeInternalCompute(int dag, int inv, Vector > & Matp, Vector > & Matm); + + void MooeeInternalAsm(const FermionField &in, FermionField &out, + int LLs, int site, + Vector > &Matp, + Vector > &Matm); + void MooeeInternalZAsm(const FermionField &in, FermionField &out, + int LLs, int site, + Vector > &Matp, + Vector > &Matm); + virtual void Instantiatable(void)=0; @@ -112,6 +149,12 @@ namespace Grid { std::vector ueem; std::vector dee; + // Matrices of 5d ee inverse params + Vector > MatpInv; + Vector > MatmInv; + Vector > MatpInvDag; + Vector > MatmInvDag; + // Constructors CayleyFermion5D(GaugeField &_Umu, GridCartesian &FiveDimGrid, diff --git a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc index 35a10de2..ed742ea3 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dvec.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dvec.cc @@ -29,13 +29,12 @@ Author: paboyle *************************************************************************************/ /* END LEGAL */ -#include + #include namespace Grid { -namespace QCD { - /* +namespace QCD { /* * Dense matrix versions of routines */ template @@ -126,7 +125,6 @@ PARALLEL_FOR_LOOP for(int v=0;v(hp_00.v); hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); @@ -165,42 +160,20 @@ PARALLEL_FOR_LOOP hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); } - /* - if ( ss==0) std::cout << " dphi_00 " <::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(l[v]()()(),hm_00); + Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(l[v]()()(),hm_01); + Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(l[v]()()(),hm_02); + Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(l[v]()()(),hm_10); + Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(l[v]()()(),hm_11); + Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(l[v]()()(),hm_12); + Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(u[v]()()(),hp_00); + Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(u[v]()()(),hp_01); + Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(u[v]()()(),hp_02); + Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(u[v]()()(),hp_10); + Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(u[v]()()(),hp_11); + Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(u[v]()()(),hp_12); - - // if ( ss==0){ - /* - std::cout << ss<<" "<< v<< " good "<< chi[ss+v]()(0)(0) << " bad "<::M5Ddag(const FermionField &psi, M5Dtime-=usecond(); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss+=LLs){ // adds LLs - +#if 0 alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hm; alignas(64) SiteSpinor fp; @@ -287,9 +260,504 @@ PARALLEL_FOR_LOOP chi[ss+v] = chi[ss+v] +l[v]*fm; } +#else + for(int v=0;v(hp_00.v); + hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v); + hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v); + hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v); + hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v); + hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v); + } + if ( vm>=v ) { + hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v); + hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v); + hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v); + hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v); + hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v); + hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v); + } + + Simd p_00 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo::mult(u[v]()()(),hp_00); + Simd p_01 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo::mult(u[v]()()(),hp_01); + Simd p_02 = switcheroo::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo::mult(u[v]()()(),hp_02); + Simd p_10 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo::mult(u[v]()()(),hp_10); + Simd p_11 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo::mult(u[v]()()(),hp_11); + Simd p_12 = switcheroo::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo::mult(u[v]()()(),hp_12); + + Simd p_20 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo::mult(l[v]()()(),hm_00); + Simd p_21 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo::mult(l[v]()()(),hm_01); + Simd p_22 = switcheroo::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo::mult(l[v]()()(),hm_02); + Simd p_30 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo::mult(l[v]()()(),hm_10); + Simd p_31 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo::mult(l[v]()()(),hm_11); + Simd p_32 = switcheroo::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo::mult(l[v]()()(),hm_12); + + vstream(chi[ss+v]()(0)(0),p_00); + vstream(chi[ss+v]()(0)(1),p_01); + vstream(chi[ss+v]()(0)(2),p_02); + vstream(chi[ss+v]()(1)(0),p_10); + vstream(chi[ss+v]()(1)(1),p_11); + vstream(chi[ss+v]()(1)(2),p_12); + vstream(chi[ss+v]()(2)(0),p_20); + vstream(chi[ss+v]()(2)(1),p_21); + vstream(chi[ss+v]()(2)(2),p_22); + vstream(chi[ss+v]()(3)(0),p_30); + vstream(chi[ss+v]()(3)(1),p_31); + vstream(chi[ss+v]()(3)(2),p_32); + } +#endif } M5Dtime+=usecond(); } + + +#ifdef AVX512 +#include +#include +#include +#endif + +template +void CayleyFermion5D::MooeeInternalAsm(const FermionField &psi, FermionField &chi, + int LLs, int site, + Vector > &Matp, + Vector > &Matm) +{ +#ifndef AVX512 + { + SiteHalfSpinor BcastP; + SiteHalfSpinor BcastM; + SiteHalfSpinor SiteChiP; + SiteHalfSpinor SiteChiM; + + // Ls*Ls * 2 * 12 * vol flops + for(int s1=0;s1); + for(int s1=0;s1 +void CayleyFermion5D::MooeeInternalZAsm(const FermionField &psi, FermionField &chi, + int LLs, int site, Vector > &Matp, Vector > &Matm) +{ +#ifndef AVX512 + { + SiteHalfSpinor BcastP; + SiteHalfSpinor BcastM; + SiteHalfSpinor SiteChiP; + SiteHalfSpinor SiteChiM; + + // Ls*Ls * 2 * 12 * vol flops + for(int s1=0;s1); + for(int s1=0;s1 void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) { @@ -299,108 +767,41 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField chi.checkerboard=psi.checkerboard; - Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); - Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); + Vector > Matp; + Vector > Matm; + Vector > *_Matp; + Vector > *_Matm; - for(int s=0;s > Matp(Ls*LLs); - Vector > Matm(Ls*LLs); + assert(_Matp->size()==Ls*LLs); - for(int s2=0;s2 SitePplus(LLs); - Vector SitePminus(LLs); - Vector SiteChiP(LLs); - Vector SiteChiM(LLs); - Vector SiteChi(LLs); - - SiteHalfSpinor BcastP; - SiteHalfSpinor BcastM; - -#pragma omp for - for(auto site=0;site::iscomplex() ) { + PARALLEL_FOR_LOOP + for(auto site=0;site::MooeeInternal(const Fermion template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); + }} diff --git a/lib/qcd/action/fermion/FermionOperator.h b/lib/qcd/action/fermion/FermionOperator.h index 742c6e08..676a0e83 100644 --- a/lib/qcd/action/fermion/FermionOperator.h +++ b/lib/qcd/action/fermion/FermionOperator.h @@ -48,6 +48,8 @@ namespace Grid { FermionOperator(const ImplParams &p= ImplParams()) : Impl(p) {}; + virtual FermionField &tmp(void) = 0; + GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); }; diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index 99baa8a0..04c3671f 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -61,7 +61,9 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), - UmuOdd(&Hgrid) { + UmuOdd(&Hgrid), + _tmp(&Hgrid) +{ // Allocate the required comms buffer ImportGauge(_Umu); } diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 40fbd1bf..933be732 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -58,6 +58,9 @@ class WilsonFermion : public WilsonKernels, public WilsonFermionStatic { GridBase *FermionGrid(void) { return _grid; } GridBase *FermionRedBlackGrid(void) { return _cbgrid; } + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + ////////////////////////////////////////////////////////////////// // override multiply; cut number routines if pass dagger argument // and also make interface more uniformly consistent diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index d2ac96e3..d70c98c3 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -60,7 +60,8 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, UmuEven(_FourDimRedBlackGrid), UmuOdd (_FourDimRedBlackGrid), Lebesgue(_FourDimGrid), - LebesgueEvenOdd(_FourDimRedBlackGrid) + LebesgueEvenOdd(_FourDimRedBlackGrid), + _tmp(&FiveDimRedBlackGrid) { if (Impl::LsVectorised) { diff --git a/lib/qcd/action/fermion/WilsonFermion5D.h b/lib/qcd/action/fermion/WilsonFermion5D.h index ffb5c58e..fb4fa925 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.h +++ b/lib/qcd/action/fermion/WilsonFermion5D.h @@ -74,6 +74,9 @@ namespace QCD { typedef WilsonKernels Kernels; PmuStat stat; + FermionField _tmp; + FermionField &tmp(void) { return _tmp; } + void Report(void); void ZeroCounters(void); double DhopCalls; diff --git a/lib/serialisation/BaseIO.h b/lib/serialisation/BaseIO.h index 7761a8e6..0357915d 100644 --- a/lib/serialisation/BaseIO.h +++ b/lib/serialisation/BaseIO.h @@ -32,6 +32,7 @@ Author: Peter Boyle #include namespace Grid { + // Vector IO utilities /////////////////////////////////////////////////////// // helper function to read space-separated values template std::vector strToVec(const std::string s) @@ -67,6 +68,77 @@ namespace Grid { return os; } + // Vector element trait ////////////////////////////////////////////////////// + template + struct element + { + typedef T type; + static constexpr bool is_number = false; + }; + + template + struct element> + { + typedef typename element::type type; + static constexpr bool is_number = std::is_arithmetic::value + or is_complex::value + or element::is_number; + }; + + // Vector flatening utility class //////////////////////////////////////////// + // Class to flatten a multidimensional std::vector + template + class Flatten + { + public: + typedef typename element::type Element; + public: + explicit Flatten(const V &vector); + const V & getVector(void); + const std::vector & getFlatVector(void); + const std::vector & getDim(void); + private: + void accumulate(const Element &e); + template + void accumulate(const W &v); + void accumulateDim(const Element &e); + template + void accumulateDim(const W &v); + private: + const V &vector_; + std::vector flatVector_; + std::vector dim_; + }; + + + // Class to reconstruct a multidimensional std::vector + template + class Reconstruct + { + public: + typedef typename element::type Element; + public: + Reconstruct(const std::vector &flatVector, + const std::vector &dim); + const V & getVector(void); + const std::vector & getFlatVector(void); + const std::vector & getDim(void); + private: + void fill(std::vector &v); + template + void fill(W &v); + void resize(std::vector &v, const unsigned int dim); + template + void resize(W &v, const unsigned int dim); + private: + V vector_; + const std::vector &flatVector_; + std::vector dim_; + size_t ind_{0}; + unsigned int dimInd_{0}; + }; + + // Abstract writer/reader classes //////////////////////////////////////////// // static polymorphism implemented using CRTP idiom class Serializable; @@ -83,12 +155,7 @@ namespace Grid { typename std::enable_if::value, void>::type write(const std::string& s, const U &output); template - typename std::enable_if::value, void>::type - write(const std::string& s, const U &output); - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type write(const std::string& s, const U &output); private: T *upcast; @@ -107,12 +174,7 @@ namespace Grid { typename std::enable_if::value, void>::type read(const std::string& s, U &output); template - typename std::enable_if::value, void>::type - read(const std::string& s, U &output); - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type read(const std::string& s, U &output); protected: template @@ -142,7 +204,128 @@ namespace Grid { } }; - // Generic writer interface + // Flatten class template implementation ///////////////////////////////////// + template + void Flatten::accumulate(const Element &e) + { + flatVector_.push_back(e); + } + + template + template + void Flatten::accumulate(const W &v) + { + for (auto &e: v) + { + accumulate(e); + } + } + + template + void Flatten::accumulateDim(const Element &e) {}; + + template + template + void Flatten::accumulateDim(const W &v) + { + dim_.push_back(v.size()); + accumulateDim(v[0]); + } + + template + Flatten::Flatten(const V &vector) + : vector_(vector) + { + accumulate(vector_); + accumulateDim(vector_); + } + + template + const V & Flatten::getVector(void) + { + return vector_; + } + + template + const std::vector::Element> & + Flatten::getFlatVector(void) + { + return flatVector_; + } + + template + const std::vector & Flatten::getDim(void) + { + return dim_; + } + + // Reconstruct class template implementation ///////////////////////////////// + template + void Reconstruct::fill(std::vector &v) + { + for (auto &e: v) + { + e = flatVector_[ind_++]; + } + } + + template + template + void Reconstruct::fill(W &v) + { + for (auto &e: v) + { + fill(e); + } + } + + template + void Reconstruct::resize(std::vector &v, const unsigned int dim) + { + v.resize(dim_[dim]); + } + + template + template + void Reconstruct::resize(W &v, const unsigned int dim) + { + v.resize(dim_[dim]); + for (auto &e: v) + { + resize(e, dim + 1); + } + } + + template + Reconstruct::Reconstruct(const std::vector &flatVector, + const std::vector &dim) + : flatVector_(flatVector) + , dim_(dim) + { + resize(vector_, 0); + fill(vector_); + } + + template + const V & Reconstruct::getVector(void) + { + return vector_; + } + + template + const std::vector::Element> & + Reconstruct::getFlatVector(void) + { + return flatVector_; + } + + template + const std::vector & Reconstruct::getDim(void) + { + return dim_; + } + + // Generic writer interface ////////////////////////////////////////////////// template inline void push(Writer &w, const std::string &s) { @@ -221,23 +404,13 @@ namespace Grid { template template - typename std::enable_if::value, void>::type - Writer::write(const std::string &s, const U &output) - { - EnumIO::write(*this, s, output); - } - - template - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type Writer::write(const std::string &s, const U &output) { upcast->writeDefault(s, output); } - // Reader template implementation //////////////////////////////////////////// + // Reader template implementation template Reader::Reader(void) { @@ -266,17 +439,7 @@ namespace Grid { template template - typename std::enable_if::value, void>::type - Reader::read(const std::string &s, U &output) - { - EnumIO::read(*this, s, output); - } - - template - template - typename std::enable_if< - !(std::is_base_of::value or std::is_enum::value), - void>::type + typename std::enable_if::value, void>::type Reader::read(const std::string &s, U &output) { upcast->readDefault(s, output); @@ -300,7 +463,6 @@ namespace Grid { abort(); } } - } #endif diff --git a/lib/serialisation/Hdf5IO.cc b/lib/serialisation/Hdf5IO.cc new file mode 100644 index 00000000..c5313495 --- /dev/null +++ b/lib/serialisation/Hdf5IO.cc @@ -0,0 +1,103 @@ +#include + +using namespace Grid; +#ifndef H5_NO_NAMESPACE +using namespace H5NS; +#endif + +// Writer implementation /////////////////////////////////////////////////////// +Hdf5Writer::Hdf5Writer(const std::string &fileName) +: fileName_(fileName) +, file_(fileName.c_str(), H5F_ACC_TRUNC) +{ + group_ = file_.openGroup("/"); + writeSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold", + Hdf5Type::type()); +} + +void Hdf5Writer::push(const std::string &s) +{ + group_ = group_.createGroup(s); + path_.push_back(s); +} + +void Hdf5Writer::pop(void) +{ + path_.pop_back(); + if (path_.empty()) + { + group_ = file_.openGroup("/"); + } + else + { + auto binOp = [](const std::string &a, const std::string &b)->std::string + { + return a + "/" + b; + }; + + group_ = group_.openGroup(std::accumulate(path_.begin(), path_.end(), + std::string(""), binOp)); + } +} + +template <> +void Hdf5Writer::writeDefault(const std::string &s, const std::string &x) +{ + StrType strType(PredType::C_S1, x.size()); + + writeSingleAttribute(*(x.data()), s, strType); +} + +void Hdf5Writer::writeDefault(const std::string &s, const char *x) +{ + std::string sx(x); + + writeDefault(s, sx); +} + +// Reader implementation /////////////////////////////////////////////////////// +Hdf5Reader::Hdf5Reader(const std::string &fileName) +: fileName_(fileName) +, file_(fileName.c_str(), H5F_ACC_RDONLY) +{ + group_ = file_.openGroup("/"); + readSingleAttribute(dataSetThres_, HDF5_GRID_GUARD "dataset_threshold", + Hdf5Type::type()); +} + +void Hdf5Reader::push(const std::string &s) +{ + group_ = group_.openGroup(s); + path_.push_back(s); +} + +void Hdf5Reader::pop(void) +{ + path_.pop_back(); + if (path_.empty()) + { + group_ = file_.openGroup("/"); + } + else + { + auto binOp = [](const std::string &a, const std::string &b)->std::string + { + return a + "/" + b; + }; + + group_ = group_.openGroup(std::accumulate(path_.begin(), path_.end(), + std::string(""), binOp)); + } +} + +template <> +void Hdf5Reader::readDefault(const std::string &s, std::string &x) +{ + Attribute attribute; + + attribute = group_.openAttribute(s); + StrType strType = attribute.getStrType(); + + x.resize(strType.getSize()); + attribute.read(strType, &(x[0])); +} diff --git a/lib/serialisation/Hdf5IO.h b/lib/serialisation/Hdf5IO.h new file mode 100644 index 00000000..2f891cd4 --- /dev/null +++ b/lib/serialisation/Hdf5IO.h @@ -0,0 +1,242 @@ +#ifndef GRID_SERIALISATION_HDF5_H +#define GRID_SERIALISATION_HDF5_H + +#include +#include +#include +#include +#include "Hdf5Type.h" + +#ifndef H5_NO_NAMESPACE +#define H5NS H5 +#endif + +// default thresold above which datasets are used instead of attributes +#ifndef HDF5_DEF_DATASET_THRES +#define HDF5_DEF_DATASET_THRES 6u +#endif + +// name guard for Grid metadata +#define HDF5_GRID_GUARD "_Grid_" + +namespace Grid +{ + class Hdf5Writer: public Writer + { + public: + Hdf5Writer(const std::string &fileName); + virtual ~Hdf5Writer(void) = default; + void push(const std::string &s); + void pop(void); + void writeDefault(const std::string &s, const char *x); + template + void writeDefault(const std::string &s, const U &x); + template + typename std::enable_if>::is_number, void>::type + writeDefault(const std::string &s, const std::vector &x); + template + typename std::enable_if>::is_number, void>::type + writeDefault(const std::string &s, const std::vector &x); + private: + template + void writeSingleAttribute(const U &x, const std::string &name, + const H5NS::DataType &type); + private: + std::string fileName_; + std::vector path_; + H5NS::H5File file_; + H5NS::Group group_; + unsigned int dataSetThres_{HDF5_DEF_DATASET_THRES}; + }; + + class Hdf5Reader: public Reader + { + public: + Hdf5Reader(const std::string &fileName); + virtual ~Hdf5Reader(void) = default; + void push(const std::string &s); + void pop(void); + template + void readDefault(const std::string &s, U &output); + template + typename std::enable_if>::is_number, void>::type + readDefault(const std::string &s, std::vector &x); + template + typename std::enable_if>::is_number, void>::type + readDefault(const std::string &s, std::vector &x); + private: + template + void readSingleAttribute(U &x, const std::string &name, + const H5NS::DataType &type); + private: + std::string fileName_; + std::vector path_; + H5NS::H5File file_; + H5NS::Group group_; + unsigned int dataSetThres_; + }; + + // Writer template implementation //////////////////////////////////////////// + template + void Hdf5Writer::writeSingleAttribute(const U &x, const std::string &name, + const H5NS::DataType &type) + { + H5NS::Attribute attribute; + hsize_t attrDim = 1; + H5NS::DataSpace attrSpace(1, &attrDim); + + attribute = group_.createAttribute(name, type, attrSpace); + attribute.write(type, &x); + } + + template + void Hdf5Writer::writeDefault(const std::string &s, const U &x) + { + writeSingleAttribute(x, s, Hdf5Type::type()); + } + + template <> + void Hdf5Writer::writeDefault(const std::string &s, const std::string &x); + + template + typename std::enable_if>::is_number, void>::type + Hdf5Writer::writeDefault(const std::string &s, const std::vector &x) + { + // alias to element type + typedef typename element>::type Element; + + // flatten the vector and getting dimensions + Flatten> flat(x); + std::vector dim; + const auto &flatx = flat.getFlatVector(); + + for (auto &d: flat.getDim()) + { + dim.push_back(d); + } + + // write to file + H5NS::DataSpace dataSpace(dim.size(), dim.data()); + + if (flatx.size() > dataSetThres_) + { + H5NS::DataSet dataSet; + + dataSet = group_.createDataSet(s, Hdf5Type::type(), dataSpace); + dataSet.write(flatx.data(), Hdf5Type::type()); + } + else + { + H5NS::Attribute attribute; + + attribute = group_.createAttribute(s, Hdf5Type::type(), dataSpace); + attribute.write(Hdf5Type::type(), flatx.data()); + } + } + + template + typename std::enable_if>::is_number, void>::type + Hdf5Writer::writeDefault(const std::string &s, const std::vector &x) + { + push(s); + writeSingleAttribute(x.size(), HDF5_GRID_GUARD "vector_size", + Hdf5Type::type()); + for (hsize_t i = 0; i < x.size(); ++i) + { + write(s + "_" + std::to_string(i), x[i]); + } + pop(); + } + + // Reader template implementation //////////////////////////////////////////// + template + void Hdf5Reader::readSingleAttribute(U &x, const std::string &name, + const H5NS::DataType &type) + { + H5NS::Attribute attribute; + + attribute = group_.openAttribute(name); + attribute.read(type, &x); + } + + template + void Hdf5Reader::readDefault(const std::string &s, U &output) + { + readSingleAttribute(output, s, Hdf5Type::type()); + } + + template <> + void Hdf5Reader::readDefault(const std::string &s, std::string &x); + + template + typename std::enable_if>::is_number, void>::type + Hdf5Reader::readDefault(const std::string &s, std::vector &x) + { + // alias to element type + typedef typename element>::type Element; + + // read the dimensions + H5NS::DataSpace dataSpace; + std::vector hdim; + std::vector dim; + hsize_t size = 1; + + if (group_.attrExists(s)) + { + dataSpace = group_.openAttribute(s).getSpace(); + } + else + { + dataSpace = group_.openDataSet(s).getSpace(); + } + hdim.resize(dataSpace.getSimpleExtentNdims()); + dataSpace.getSimpleExtentDims(hdim.data()); + for (auto &d: hdim) + { + dim.push_back(d); + size *= d; + } + + // read the flat vector + std::vector buf(size); + + if (size > dataSetThres_) + { + H5NS::DataSet dataSet; + + dataSet = group_.openDataSet(s); + dataSet.read(buf.data(), Hdf5Type::type()); + } + else + { + H5NS::Attribute attribute; + + attribute = group_.openAttribute(s); + attribute.read(Hdf5Type::type(), buf.data()); + } + + // reconstruct the multidimensional vector + Reconstruct> r(buf, dim); + + x = r.getVector(); + } + + template + typename std::enable_if>::is_number, void>::type + Hdf5Reader::readDefault(const std::string &s, std::vector &x) + { + uint64_t size; + + push(s); + readSingleAttribute(size, HDF5_GRID_GUARD "vector_size", + Hdf5Type::type()); + x.resize(size); + for (hsize_t i = 0; i < x.size(); ++i) + { + read(s + "_" + std::to_string(i), x[i]); + } + pop(); + } +} + +#endif diff --git a/lib/serialisation/Hdf5Type.h b/lib/serialisation/Hdf5Type.h new file mode 100644 index 00000000..8634f35b --- /dev/null +++ b/lib/serialisation/Hdf5Type.h @@ -0,0 +1,77 @@ +#ifndef GRID_SERIALISATION_HDF5_TYPE_H +#define GRID_SERIALISATION_HDF5_TYPE_H + +#include +#include +#include + +#ifndef H5_NO_NAMESPACE +#define H5NS H5 +#endif + +#define HDF5_NATIVE_TYPE(predType, cType)\ +template <>\ +class Hdf5Type\ +{\ +public:\ + static inline const H5NS::DataType & type(void)\ + {\ + return H5NS::PredType::predType;\ + }\ + static constexpr bool isNative = true;\ +}; + +#define DEFINE_HDF5_NATIVE_TYPES \ +HDF5_NATIVE_TYPE(NATIVE_B8, bool);\ +HDF5_NATIVE_TYPE(NATIVE_CHAR, char);\ +HDF5_NATIVE_TYPE(NATIVE_SCHAR, signed char);\ +HDF5_NATIVE_TYPE(NATIVE_UCHAR, unsigned char);\ +HDF5_NATIVE_TYPE(NATIVE_SHORT, short);\ +HDF5_NATIVE_TYPE(NATIVE_USHORT, unsigned short);\ +HDF5_NATIVE_TYPE(NATIVE_INT, int);\ +HDF5_NATIVE_TYPE(NATIVE_UINT, unsigned int);\ +HDF5_NATIVE_TYPE(NATIVE_LONG, long);\ +HDF5_NATIVE_TYPE(NATIVE_ULONG, unsigned long);\ +HDF5_NATIVE_TYPE(NATIVE_LLONG, long long);\ +HDF5_NATIVE_TYPE(NATIVE_ULLONG, unsigned long long);\ +HDF5_NATIVE_TYPE(NATIVE_FLOAT, float);\ +HDF5_NATIVE_TYPE(NATIVE_DOUBLE, double);\ +HDF5_NATIVE_TYPE(NATIVE_LDOUBLE, long double); + +namespace Grid +{ + template class Hdf5Type + { + public: + static constexpr bool isNative = false; + }; + + DEFINE_HDF5_NATIVE_TYPES; + + template + class Hdf5Type> + { + public: + static inline const H5NS::DataType & type(void) + { + if (typePtr_ == nullptr) + { + typePtr_.reset(new H5NS::CompType(sizeof(std::complex))); + typePtr_->insertMember("re", 0, Hdf5Type::type()); + typePtr_->insertMember("im", sizeof(R), Hdf5Type::type()); + } + + return *typePtr_; + } + static constexpr bool isNative = false; + private: + static std::unique_ptr typePtr_; + }; + + template + std::unique_ptr Hdf5Type>::typePtr_ = nullptr; +} + +#undef HDF5_NATIVE_TYPE + +#endif /* GRID_SERIALISATION_HDF5_TYPE_H */ diff --git a/lib/serialisation/MacroMagic.h b/lib/serialisation/MacroMagic.h index c78bba0c..8b027f30 100644 --- a/lib/serialisation/MacroMagic.h +++ b/lib/serialisation/MacroMagic.h @@ -109,40 +109,36 @@ THE SOFTWARE. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #define GRID_MACRO_MEMBER(A,B) A B; +#define GRID_MACRO_COMP_MEMBER(A,B) result = (result and (lhs. B == rhs. B)); #define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <\ - static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ - push(WR,s);\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ - pop(WR);\ - } \ - \ - \ - template \ - static inline void read(Reader &RD,const std::string &s, cname &obj){ \ - push(RD,s);\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ - pop(RD);\ - } \ - \ - \ - friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \ - os<<"class "<<#cname<<" {"<\ +static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ + push(WR,s);\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ + pop(WR);\ +}\ +template \ +static inline void read(Reader &RD,const std::string &s, cname &obj){ \ + push(RD,s);\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \ + pop(RD);\ +}\ +friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \ + os<<"class "<<#cname<<" {"<::type #define GRID_MACRO_ENUMVAL(A,B) A = B, @@ -150,44 +146,52 @@ THE SOFTWARE. #define GRID_MACRO_ENUMTEST(A,B) else if (buf == #A) {obj = GRID_ENUM_TYPE(obj)::A;} #define GRID_MACRO_ENUMCASEIO(A,B) case GRID_ENUM_TYPE(obj)::A: os << #A; break; -namespace Grid { - template - class EnumIO {}; -} - #define GRID_SERIALIZABLE_ENUM(name,undefname,...)\ - enum class name {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\ - undefname = -1\ +class name: public Grid::Serializable\ +{\ +public:\ + enum EnumType\ + {\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMVAL,__VA_ARGS__))\ + undefname = -1\ };\ +public:\ + name(void): value_(undefname) {};\ + name(EnumType value): value_(value) {};\ + template \ + static inline void write(Grid::Writer &WR,const std::string &s, const name &obj)\ + {\ + switch (obj.value_)\ + {\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\ + default: Grid::write(WR,s,#undefname); break;\ + }\ + }\ \ - template<>\ - class EnumIO {\ - public:\ - template \ - static inline void write(Writer &WR,const std::string &s, const name &obj){ \ - switch (obj) {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\ - default: Grid::write(WR,s,#undefname); break;\ - }\ - }\ - \ - template \ - static inline void read(Reader &RD,const std::string &s, name &obj){ \ - std::string buf;\ - Grid::read(RD, s, buf);\ - if (buf == #undefname) {obj = name::undefname;}\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\ - else {obj = name::undefname;}\ - }\ - };\ - \ - inline std::ostream & operator << (std::ostream &os, const name &obj ) { \ + template \ + static inline void read(Grid::Reader &RD,const std::string &s, name &obj)\ + {\ + std::string buf;\ + Grid::read(RD, s, buf);\ + if (buf == #undefname) {obj = name::undefname;}\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMTEST,__VA_ARGS__))\ + else {obj = name::undefname;}\ + }\ + inline operator EnumType(void) const\ + {\ + return value_;\ + }\ + inline friend std::ostream & operator<<(std::ostream &os, const name &obj)\ + {\ switch (obj) {\ - GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\ - default: os << #undefname; break;\ + GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\ + default: os << #undefname; break;\ }\ return os;\ - }; + }\ +private:\ + EnumType value_;\ +}; + #endif diff --git a/lib/serialisation/Serialisation.h b/lib/serialisation/Serialisation.h index 8f405d73..aa84e989 100644 --- a/lib/serialisation/Serialisation.h +++ b/lib/serialisation/Serialisation.h @@ -36,6 +36,9 @@ Author: Peter Boyle #include "BinaryIO.h" #include "TextIO.h" #include "XmlIO.h" +#ifdef HAVE_HDF5 +#include "Hdf5IO.h" +#endif ////////////////////////////////////////// // Todo: ////////////////////////////////////////// diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 36360102..724f52bb 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -213,6 +213,29 @@ namespace Optimization { } }; + struct MultRealPart{ + inline __m256 operator()(__m256 a, __m256 b){ + __m256 ymm0; + ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m256d operator()(__m256d a, __m256d b){ + __m256d ymm0; + ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m256 operator()(__m256 a, __m256 b, __m256 c){ + __m256 ymm0 = _mm256_moveldup_ps(a); // ymm0 <- ar ar, + return _mm256_add_ps(_mm256_mul_ps( ymm0, b),c); + } + inline __m256d operator()(__m256d a, __m256d b, __m256d c){ + __m256d ymm0 = _mm256_shuffle_pd( a, a, 0x0 ); + return _mm256_add_pd(_mm256_mul_pd( ymm0, b),c); + } + }; + struct MultComplex{ // Complex float inline __m256 operator()(__m256 a, __m256 b){ @@ -627,7 +650,9 @@ namespace Optimization { typedef Optimization::Sub SubSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h index d6531d57..ebf99e16 100644 --- a/lib/simd/Grid_avx512.h +++ b/lib/simd/Grid_avx512.h @@ -189,6 +189,29 @@ namespace Optimization { // 2mul,4 mac +add+sub = 8 flop type insns // 3shuf + 2 (+shuf) = 5/6 simd perm and 1/2 the load. + struct MultRealPart{ + inline __m512 operator()(__m512 a, __m512 b){ + __m512 ymm0; + ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, + return _mm512_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m512d operator()(__m512d a, __m512d b){ + __m512d ymm0; + ymm0 = _mm512_shuffle_pd(a,a,0x00); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm512_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m512 operator()(__m512 a, __m512 b, __m512 c){ + __m512 ymm0 = _mm512_moveldup_ps(a); // ymm0 <- ar ar, + return _mm512_fmadd_ps( ymm0, b, c); + } + inline __m512d operator()(__m512d a, __m512d b, __m512d c){ + __m512d ymm0 = _mm512_shuffle_pd( a, a, 0x00 ); + return _mm512_fmadd_pd( ymm0, b, c); + } + }; + struct MultComplex{ // Complex float inline __m512 operator()(__m512 a, __m512 b){ @@ -501,6 +524,8 @@ namespace Optimization { typedef Optimization::Mult MultSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_generic.h b/lib/simd/Grid_generic.h index 62c78afb..91e9cda2 100644 --- a/lib/simd/Grid_generic.h +++ b/lib/simd/Grid_generic.h @@ -224,6 +224,21 @@ namespace Optimization { #define cmul(a, b, c, i)\ c[i] = a[i]*b[i] - a[i+1]*b[i+1];\ c[i+1] = a[i]*b[i+1] + a[i+1]*b[i]; + + struct MultRealPart{ + template + inline vec operator()(vec a, vec b){ + vec out; + + VECTOR_FOR(i, W::c, 1) + { + out.v[2*i] = a[2*i]*b[2*i]; + out.v[2*i+1] = a[2*i]*b[2*i+1]; + } + return out; + }; + }; + struct MultComplex{ // Complex @@ -456,6 +471,7 @@ namespace Optimization { typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index bc86291d..99a9ea68 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -220,6 +220,14 @@ namespace Optimization { } }; + struct MultRealPart{ + // Complex double + inline vector4double operator()(vector4double a, vector4double b){ + // return vec_xmul(b, a); + return vec_xmul(a, b); + } + FLOAT_WRAP_2(operator(), inline) + }; struct MultComplex{ // Complex double inline vector4double operator()(vector4double a, vector4double b){ @@ -430,6 +438,7 @@ typedef Optimization::Sub SubSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::Div DivSIMD; typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::MultRealPart MultRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h index 560eda11..943756b2 100644 --- a/lib/simd/Grid_sse4.h +++ b/lib/simd/Grid_sse4.h @@ -177,6 +177,29 @@ namespace Optimization { } }; + struct MultRealPart{ + inline __m128 operator()(__m128 a, __m128 b){ + __m128 ymm0; + ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + } + inline __m128d operator()(__m128d a, __m128d b){ + __m128d ymm0; + ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00 + return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + } + }; + struct MaddRealPart{ + inline __m128 operator()(__m128 a, __m128 b, __m128 c){ + __m128 ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar, + return _mm_add_ps(_mm_mul_ps( ymm0, b),c); + } + inline __m128d operator()(__m128d a, __m128d b, __m128d c){ + __m128d ymm0 = _mm_shuffle_pd( a, a, 0x0 ); + return _mm_add_pd(_mm_mul_pd( ymm0, b),c); + } + }; + struct MultComplex{ // Complex float inline __m128 operator()(__m128 a, __m128 b){ @@ -325,9 +348,11 @@ namespace Optimization { } } +#ifndef _mm_alignr_epi64 #define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16) #define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16) - +#endif + template static inline __m128 tRotate(__m128 in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); }; template static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); }; @@ -415,6 +440,8 @@ namespace Optimization { typedef Optimization::Div DivSIMD; typedef Optimization::Mult MultSIMD; typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; typedef Optimization::Conj ConjSIMD; typedef Optimization::TimesMinusI TimesMinusISIMD; typedef Optimization::TimesI TimesISIMD; diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 42f28b34..8a6ab2e7 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -101,6 +101,11 @@ template using IfNotInteger = Invoke +Out trinary(Input1 src_1, Input2 src_2, Input3 src_3, Operation op) { + return op(src_1, src_2, src_3); +} + template Out binary(Input1 src_1, Input2 src_2, Operation op) { return op(src_1, src_2); @@ -178,6 +183,7 @@ class Grid_simd { const Grid_simd *__restrict__ r) { *y = (*l) * (*r); } + friend inline void sub(Grid_simd *__restrict__ y, const Grid_simd *__restrict__ l, const Grid_simd *__restrict__ r) { @@ -188,7 +194,6 @@ class Grid_simd { const Grid_simd *__restrict__ r) { *y = (*l) + (*r); } - friend inline void mac(Grid_simd *__restrict__ y, const Scalar_type *__restrict__ a, const Grid_simd *__restrict__ x) { @@ -260,7 +265,7 @@ class Grid_simd { } //////////////////////////// - // opreator scalar * simd + // operator scalar * simd //////////////////////////// friend inline Grid_simd operator*(const Scalar_type &a, Grid_simd b) { Grid_simd va; @@ -433,6 +438,11 @@ inline void vbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ S* typepun =(S*) &src; vsplat(ret,typepun[lane]); } +template =0> +inline void rbroadcast(Grid_simd &ret,const Grid_simd &src,int lane){ + S* typepun =(S*) &src; + ret.v = unary(real(typepun[lane]), VsplatSIMD()); +} /////////////////////// // Splat @@ -449,6 +459,10 @@ template inline void vsplat(Grid_simd &ret, EnableIf, S> c) { vsplat(ret, real(c), imag(c)); } +template +inline void rsplat(Grid_simd &ret, EnableIf, S> c) { + vsplat(ret, real(c), real(c)); +} // if real fill with a, if complex fill with a in the real part (first function // above) @@ -550,6 +564,21 @@ inline Grid_simd operator-(Grid_simd a, Grid_simd b) { return ret; }; +// Distinguish between complex types and others +template = 0> +inline Grid_simd real_mult(Grid_simd a, Grid_simd b) { + Grid_simd ret; + ret.v = binary(a.v, b.v, MultRealPartSIMD()); + return ret; +}; +template = 0> +inline Grid_simd real_madd(Grid_simd a, Grid_simd b, Grid_simd c) { + Grid_simd ret; + ret.v = trinary(a.v, b.v, c.v, MaddRealPartSIMD()); + return ret; +}; + + // Distinguish between complex types and others template = 0> inline Grid_simd operator*(Grid_simd a, Grid_simd b) { diff --git a/lib/simd/Intel512avx.h b/lib/simd/Intel512avx.h index 19157db4..7b5964ad 100644 --- a/lib/simd/Intel512avx.h +++ b/lib/simd/Intel512avx.h @@ -95,10 +95,14 @@ Author: paboyle #define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2 #define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n" -#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" -#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" -#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" -#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" +#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n" +#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n" +#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n" +#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n" +#define VBCASTCDUPf(OFF,A,DEST) "vbroadcastsd (" #OFF "*64 )(" #A ")," #DEST ";\n" +#define VBCASTZDUPf(OFF,A,DEST) "vbroadcastf32x4 (" #OFF "*64 )(" #A ")," #DEST ";\n" +#define VBCASTCDUP(OFF,A,DEST) VBCASTCDUPf(OFF,A,DEST) +#define VBCASTZDUP(OFF,A,DEST) VBCASTZDUPf(OFF,A,DEST) #define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n" #define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n" @@ -106,11 +110,15 @@ Author: paboyle #define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n" +#define VMADDRDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" +#define VMADDIDUPf(O,P,B,accum) "vfmadd231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" #define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n" #define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n" +#define VMADDRDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" +#define VMADDIDUPd(O,P,B,accum) "vfmadd231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" #define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n" #define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n" diff --git a/lib/simd/Intel512common.h b/lib/simd/Intel512common.h index cfa20c26..e69e541c 100644 --- a/lib/simd/Intel512common.h +++ b/lib/simd/Intel512common.h @@ -87,7 +87,8 @@ Author: paboyle VACCTIMESMINUSI1d(A,ACC,tmp) \ VACCTIMESMINUSI2d(A,ACC,tmp) -#define LOAD64i(A,ptr) __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A ); +#define LOAD64a(A,ptr) "movq %0, %" #A : : "r"(ptr) : #A +#define LOAD64i(A,ptr) __asm__ ( LOAD64a(A,ptr)); #define LOAD64(A,ptr) LOAD64i(A,ptr) #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n" @@ -108,8 +109,8 @@ Author: paboyle //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n" // "clevict0 "#O"*64("#A");\n" -#define VLOADf(OFF,PTR,DEST) "vmovaps " #OFF "*64(" #PTR "), " #DEST ";\n" -#define VLOADd(OFF,PTR,DEST) "vmovapd " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VLOADf(OFF,PTR,DEST) "vmovups " #OFF "*64(" #PTR "), " #DEST ";\n" +#define VLOADd(OFF,PTR,DEST) "vmovupd " #OFF "*64(" #PTR "), " #DEST ";\n" #define VADDf(A,B,DEST) "vaddps " #A "," #B "," #DEST ";\n" #define VADDd(A,B,DEST) "vaddpd " #A "," #B "," #DEST ";\n" @@ -143,8 +144,8 @@ Author: paboyle #define VSTOREf(OFF,PTR,SRC) "vmovntps " #SRC "," #OFF "*64(" #PTR ")" ";\n" #define VSTOREd(OFF,PTR,SRC) "vmovntpd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #else -#define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n" -#define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREf(OFF,PTR,SRC) "vmovups " #SRC "," #OFF "*64(" #PTR ")" ";\n" +#define VSTOREd(OFF,PTR,SRC) "vmovupd " #SRC "," #OFF "*64(" #PTR ")" ";\n" #endif // Swaps Re/Im ; could unify this with IMCI diff --git a/lib/simd/Intel512double.h b/lib/simd/Intel512double.h index 224c593d..632b5639 100644 --- a/lib/simd/Intel512double.h +++ b/lib/simd/Intel512double.h @@ -144,10 +144,12 @@ Author: paboyle #define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum) #define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum) +#undef VMADDRDUP #undef VMADDSUBRDUP #undef VMADDSUBIDUP #undef VMULRDUP #undef VMULIDUP +#define VMADDRDUP(O,P,B,accum) VMADDRDUPd(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum) diff --git a/lib/simd/Intel512single.h b/lib/simd/Intel512single.h index 3fa47668..ed135651 100644 --- a/lib/simd/Intel512single.h +++ b/lib/simd/Intel512single.h @@ -144,10 +144,12 @@ Author: paboyle #define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum) #define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum) +#undef VMADDRDUP #undef VMADDSUBRDUP #undef VMADDSUBIDUP #undef VMULRDUP #undef VMULIDUP +#define VMADDRDUP(O,P,B,accum) VMADDRDUPf(O,P,B,accum) #define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum) #define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum) #define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum) diff --git a/scripts/filelist b/scripts/filelist index 1ab95c7c..bf2fbc41 100755 --- a/scripts/filelist +++ b/scripts/filelist @@ -4,9 +4,8 @@ home=`pwd` # library Make.inc cd $home/lib -HFILES=`find . -type f -name '*.h' -not -path '*/Old/*' -not -path '*/Eigen/*'` -HFILES="$HFILES" -CCFILES=`find . -type f -name '*.cc' -not -name '*ommunicator*.cc'` +HFILES=`find . -type f -name '*.h' -not -name '*Hdf5*' -not -path '*/Old/*' -not -path '*/Eigen/*'` +CCFILES=`find . -type f -name '*.cc' -not -name '*Communicator*.cc' -not -name '*Hdf5*'` echo HFILES=$HFILES > Make.inc echo >> Make.inc echo CCFILES=$CCFILES >> Make.inc diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc index e23aa1a3..8204b05b 100644 --- a/tests/IO/Test_serialisation.cc +++ b/tests/IO/Test_serialisation.cc @@ -28,130 +28,152 @@ Author: Peter Boyle /* END LEGAL */ #include -namespace Grid { - - GRID_SERIALIZABLE_ENUM(myenum, undef, red, 1, blue, 2, green, 3); - - class myclass: Serializable { - public: - - GRID_SERIALIZABLE_CLASS_MEMBERS(myclass, - myenum, e, - std::vector, ve, - std::string, name, - int, x, - double, y, - bool , b, - std::vector, array, - std::vector>, twodimarray, - ); - - myclass() {} - myclass(int i) - : array(4,5.1), twodimarray(3,std::vector(2,1.23456)), ve(2, myenum::blue) - { - e=myenum::red; - x=i; - y=2*i; - b=true; - name="bother said pooh"; - } - }; - -} - using namespace Grid; -int16_t i16 = 1; +GRID_SERIALIZABLE_ENUM(myenum, undef, red, 1, blue, 2, green, 3); + +class myclass: Serializable { +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(myclass, + myenum, e, + std::vector, ve, + std::string, name, + int, x, + double, y, + bool , b, + std::vector, array, + std::vector>, twodimarray, + std::vector>>, cmplx3darray + ); + myclass() {} + myclass(int i) + : array(4,5.1) + , twodimarray(3,std::vector(5, 1.23456)) + , cmplx3darray(3,std::vector>(5, std::vector(7, Complex(1.2, 3.4)))) + , ve(2, myenum::blue) + { + e=myenum::red; + x=i; + y=2*i; + b=true; + name="bother said pooh"; + } +}; + +int16_t i16 = 1; uint16_t u16 = 2; -int32_t i32 = 3; +int32_t i32 = 3; uint32_t u32 = 4; -int64_t i64 = 5; +int64_t i64 = 5; uint64_t u64 = 6; -float f = M_PI; -double d = 2*M_PI; -bool b = false; +float f = M_PI; +double d = 2*M_PI; +bool b = false; + +template +void ioTest(const std::string &filename, const O &object, const std::string &name) +{ + // writer needs to be destroyed so that writing physically happens + { + W writer(filename); + + write(writer, "testobject", object); + } + + R reader(filename); + O buf; + bool good; + + read(reader, "testobject", buf); + good = (object == buf); + std::cout << name << " IO test: " << (good ? "success" : "failure"); + std::cout << std::endl; + if (!good) exit(EXIT_FAILURE); +} int main(int argc,char **argv) { - { - XmlWriter WR("bother.xml"); - - // test basic type writing - push(WR,"BasicTypes"); - write(WR,std::string("i16"),i16); - write(WR,"u16",u16); - write(WR,"i32",i32); - write(WR,"u32",u32); - write(WR,"i64",i64); - write(WR,"u64",u64); - write(WR,"f",f); - write(WR,"d",d); - write(WR,"b",b); - pop(WR); - - // test serializable class writing - myclass obj(1234); // non-trivial constructor - write(WR,"obj",obj); - WR.write("obj2", obj); - std::cout << obj << std::endl; - - std::vector vec; - vec.push_back(myclass(1234)); - vec.push_back(myclass(5678)); - vec.push_back(myclass(3838)); - write(WR, "objvec", vec); - }; + std::cout << "==== basic IO" << std::endl; + XmlWriter WR("bother.xml"); + + // test basic type writing + std::cout << "-- basic writing to 'bother.xml'..." << std::endl; + push(WR,"BasicTypes"); + write(WR,std::string("i16"),i16); + write(WR,"u16",u16); + write(WR,"i32",i32); + write(WR,"u32",u32); + write(WR,"i64",i64); + write(WR,"u64",u64); + write(WR,"f",f); + write(WR,"d",d); + write(WR,"b",b); + pop(WR); + + // test serializable class writing + myclass obj(1234); // non-trivial constructor + std::vector vec; + + std::cout << "-- serialisable class writing to 'bother.xml'..." << std::endl; + write(WR,"obj",obj); + WR.write("obj2", obj); + vec.push_back(myclass(1234)); + vec.push_back(myclass(5678)); + vec.push_back(myclass(3838)); + write(WR, "objvec", vec); + std::cout << "-- serialisable class writing to std::cout:" << std::endl; + std::cout << obj << std::endl; + std::cout << "-- serialisable class comparison:" << std::endl; + std::cout << "vec[0] == obj: " << ((vec[0] == obj) ? "true" : "false") << std::endl; + std::cout << "vec[1] == obj: " << ((vec[1] == obj) ? "true" : "false") << std::endl; // read tests - myclass copy1, copy2, copy3; - std::vector veccopy1, veccopy2, veccopy3; + std::cout << "\n==== IO self-consistency tests" << std::endl; //// XML - { - XmlReader RD("bother.xml"); - read(RD,"obj",copy1); - read(RD,"objvec", veccopy1); - std::cout << "Loaded (XML) -----------------" << std::endl; - std::cout << copy1 << std::endl << veccopy1 << std::endl; - } + ioTest("iotest.xml", obj, "XML (object) "); + ioTest("iotest.xml", vec, "XML (vector of objects)"); //// binary - { - BinaryWriter BWR("bother.bin"); - write(BWR,"discard",copy1 ); - write(BWR,"discard",veccopy1 ); - } - { - BinaryReader BRD("bother.bin"); - read (BRD,"discard",copy2 ); - read (BRD,"discard",veccopy2 ); - std::cout << "Loaded (bin) -----------------" << std::endl; - std::cout << copy2 << std::endl << veccopy2 << std::endl; - } + ioTest("iotest.bin", obj, "binary (object) "); + ioTest("iotest.bin", vec, "binary (vector of objects)"); //// text - { - TextWriter TWR("bother.txt"); - write(TWR,"discard",copy1 ); - write(TWR,"discard",veccopy1 ); - } - { - TextReader TRD("bother.txt"); - read (TRD,"discard",copy3 ); - read (TRD,"discard",veccopy3 ); - std::cout << "Loaded (txt) -----------------" << std::endl; - std::cout << copy3 << std::endl << veccopy3 << std::endl; - } + ioTest("iotest.dat", obj, "text (object) "); + ioTest("iotest.dat", vec, "text (vector of objects)"); + //// HDF5 +#ifdef HAVE_HDF5 + ioTest("iotest.h5", obj, "HDF5 (object) "); + ioTest("iotest.h5", vec, "HDF5 (vector of objects)"); +#endif - std::vector iv = strToVec("1 2 2 4"); - std::vector sv = strToVec("bli bla blu"); + std::cout << "\n==== vector flattening/reconstruction" << std::endl; + typedef std::vector>> vec3d; - for (auto &e: iv) + vec3d dv, buf; + double d = 0.; + + dv.resize(4); + for (auto &v1: dv) { - std::cout << e << " "; + v1.resize(3); + for (auto &v2: v1) + { + v2.resize(5); + for (auto &x: v2) + { + x = d++; + } + } } - std::cout << std::endl; - for (auto &e: sv) - { - std::cout << e << " "; - } - std::cout << std::endl; + std::cout << "original 3D vector:" << std::endl; + std::cout << dv << std::endl; + + Flatten flatdv(dv); + + std::cout << "\ndimensions:" << std::endl; + std::cout << flatdv.getDim() << std::endl; + std::cout << "\nflattened vector:" << std::endl; + std::cout << flatdv.getFlatVector() << std::endl; + + Reconstruct rec(flatdv.getFlatVector(), flatdv.getDim()); + std::cout << "\nreconstructed vector:" << std::endl; + std::cout << flatdv.getVector() << std::endl; } diff --git a/tests/debug/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc similarity index 100% rename from tests/debug/Test_cayley_even_odd_vec.cc rename to tests/Test_cayley_even_odd_vec.cc