1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 22:07:05 +01:00

Hadrons: moving Hadrons to root directory, build system improvements

This commit is contained in:
2018-08-28 15:00:40 +01:00
parent 5f206df775
commit fb7d021b9d
499 changed files with 429 additions and 846 deletions

37
Grid/DisableWarnings.h Normal file
View File

@ -0,0 +1,37 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/DisableWarnings.h
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H
//disables and intel compiler specific warning (in json.hpp)
#pragma warning disable 488
#endif

49
Grid/Grid.h Normal file
View File

@ -0,0 +1,49 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Grid.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//
// Grid.h
// simd
//
// Created by Peter Boyle on 09/05/2014.
// Copyright (c) 2014 University of Edinburgh. All rights reserved.
//
#ifndef GRID_H
#define GRID_H
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/Action.h>
#include <Grid/qcd/utils/GaugeFix.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/parallelIO/MetaData.h>
#include <Grid/qcd/hmc/HMC_aggregate.h>
#endif

61
Grid/GridCore.h Normal file
View File

@ -0,0 +1,61 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Grid.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//
// Grid.h
// simd
//
// Created by Peter Boyle on 09/05/2014.
// Copyright (c) 2014 University of Edinburgh. All rights reserved.
//
#ifndef GRID_BASE_H
#define GRID_BASE_H
#include <Grid/GridStd.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/threads/Threads.h>
#include <Grid/util/Util.h>
#include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h>
#include <Grid/tensors/Tensors.h>
#include <Grid/lattice/Lattice.h>
#include <Grid/cshift/Cshift.h>
#include <Grid/stencil/Stencil.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h>
#endif

42
Grid/GridQCDcore.h Normal file
View File

@ -0,0 +1,42 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Grid.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_CORE_H
#define GRID_QCD_CORE_H
/////////////////////////
// Core Grid QCD headers
/////////////////////////
#include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h>
#endif

29
Grid/GridStd.h Normal file
View File

@ -0,0 +1,29 @@
#ifndef GRID_STD_H
#define GRID_STD_H
///////////////////
// Std C++ dependencies
///////////////////
#include <cassert>
#include <complex>
#include <vector>
#include <string>
#include <iostream>
#include <iomanip>
#include <random>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
#include <sys/time.h>
#include <chrono>
#include <zlib.h>
///////////////////
// Grid config
///////////////////
#include "Config.h"
#endif /* GRID_STD_H */

9
Grid/Grid_Eigen_Dense.h Normal file
View File

@ -0,0 +1,9 @@
#pragma once
#if defined __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#include <Grid/Eigen/Dense>
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif

37
Grid/Makefile.am Normal file
View File

@ -0,0 +1,37 @@
extra_sources=
extra_headers=
if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_mpi3.cc
extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryMPI.cc
extra_sources+=communicator/SharedMemory.cc
endif
if BUILD_COMMS_NONE
extra_sources+=communicator/Communicator_none.cc
extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryNone.cc
extra_sources+=communicator/SharedMemory.cc
endif
if BUILD_HDF5
extra_sources+=serialisation/Hdf5IO.cc
extra_headers+=serialisation/Hdf5IO.h
extra_headers+=serialisation/Hdf5Type.h
endif
#
# Libraries
#
include Make.inc
include Eigen.inc
lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources)
HFILES += $(extra_headers)
libGrid_a_SOURCES = $(CCFILES)
libGrid_adir = $(includedir)/Grid
nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files) Config.h

View File

@ -0,0 +1,61 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Algorithms.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H
#include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h>
#include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/FFT.h>
// EigCg
// Pcg
// Hdcg
// GCR
// etc..
#endif

View File

@ -0,0 +1,480 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/CoarsenedMatrix.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H
namespace Grid {
class Geometry {
// int dimension;
public:
int npoint;
std::vector<int> directions ;
std::vector<int> displacements;
Geometry(int _d) {
int base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
for(int d=0;d<_d;d++){
directions[2*d ] = d+base;
directions[2*d+1] = d+base;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*_d]=0;
displacements[2*_d]=0;
//// report back
std::cout<<GridLogMessage<<"directions :";
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
std::cout <<std::endl;
std::cout<<GridLogMessage<<"displacements :";
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
std::cout<<std::endl;
}
/*
// Original cleaner code
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
for(int d=0;d<dimension;d++){
directions[2*d ] = d;
directions[2*d+1] = d;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*dimension]=0;
displacements[2*dimension]=0;
}
std::vector<int> GetDelta(int point) {
std::vector<int> delta(dimension,0);
delta[directions[point]] = displacements[point];
return delta;
};
*/
};
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal();
}
void CheckOrthogonal(void){
CoarseVector iProj(CoarseGrid);
CoarseVector eProj(CoarseGrid);
for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace);
eProj=zero;
parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){
eProj._odata[ss](i)=CComplex(1.0);
}
eProj=eProj - iProj;
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
}
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.checkerboard = subspace[0].checkerboard;
blockPromote(CoarseVec,FineVec,subspace);
}
void CreateSubspaceRandom(GridParallelRNG &RNG){
for(int i=0;i<nbasis;i++){
random(RNG,subspace[i]);
std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
}
Orthogonalise();
}
/*
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
// Run a Lanczos with sloppy convergence
const int Nstop = nn;
const int Nk = nn+20;
const int Np = nn+20;
const int Nm = Nk+Np;
const int MaxIt= 10000;
RealD resid = 1.0e-3;
Chebyshev<FineField> Cheb(0.5,64.0,21);
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
// IRL.lock = 1;
FineField noise(FineGrid); gaussian(RNG,noise);
FineField tmp(FineGrid);
std::vector<RealD> eval(Nm);
std::vector<FineField> evec(Nm,FineGrid);
int Nconv;
IRL.calc(eval,evec,
noise,
Nconv);
// pull back nn vectors
for(int b=0;b<nn;b++){
subspace[b] = evec[b];
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
noise = tmp - sqrt(eval[b])*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
noise = tmp + eval[b]*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
}
Orthogonalise();
for(int b=0;b<nn;b++){
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
}
}
*/
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,10000);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
Orthogonalise();
}
};
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
////////////////////
// Data members
////////////////////
Geometry geom;
GridBase * _grid;
CartesianStencil<siteVector,siteVector> Stencil;
std::vector<CoarseMatrix> A;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
RealD M (const CoarseVector &in, CoarseVector &out){
conformable(_grid,in._grid);
conformable(in._grid,out._grid);
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
siteVector res = zero;
siteVector nbr;
int ptype;
StencilEntry *SE;
for(int point=0;point<geom.npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in._odata[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in._odata[SE->_offset];
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
res = res + A[point]._odata[ss]*nbr;
}
vstream(out._odata[ss],res);
}
return norm2(out);
};
RealD Mdag (const CoarseVector &in, CoarseVector &out){
return M(in,out);
};
// Defer support for further coarsening for now
void Mdiag (const CoarseVector &in, CoarseVector &out){};
void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp){};
CoarsenedMatrix(GridCartesian &CoarseGrid) :
_grid(&CoarseGrid),
geom(CoarseGrid._ndimension),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
A(geom.npoint,&CoarseGrid)
{
};
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace){
FineField iblock(FineGrid); // contributions from within this block
FineField oblock(FineGrid); // contributions from outwith this block
FineField phi(FineGrid);
FineField tmp(FineGrid);
FineField zz(FineGrid); zz=zero;
FineField Mphi(FineGrid);
Lattice<iScalar<vInteger> > coor(FineGrid);
CoarseVector iProj(Grid());
CoarseVector oProj(Grid());
CoarseScalar InnerProd(Grid());
// Orthogonalise the subblocks over the basis
blockOrthogonalise(InnerProd,Subspace.subspace);
// Compute the matrix elements of linop between this orthonormal
// set of vectors.
int self_stencil=-1;
for(int p=0;p<geom.npoint;p++){
A[p]=zero;
if( geom.displacements[p]==0){
self_stencil=p;
}
}
assert(self_stencil!=-1);
for(int i=0;i<nbasis;i++){
phi=Subspace.subspace[i];
std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
for(int p=0;p<geom.npoint;p++){
int dir = geom.directions[p];
int disp = geom.displacements[p];
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
LatticeCoordinate(coor,dir);
if ( disp==0 ){
linop.OpDiag(phi,Mphi);
}
else {
linop.OpDir(phi,Mphi,dir,disp);
}
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
////////////////////////////////////////////////////////////////////////
if ( disp==0 ) {
iblock = Mphi;
oblock = zero;
} else if ( disp==1 ) {
oblock = where(mod(coor,block)==(block-1),Mphi,zz);
iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
} else if ( disp==-1 ) {
oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
} else {
assert(0);
}
Subspace.ProjectToSubspace(iProj,iblock);
Subspace.ProjectToSubspace(oProj,oblock);
// blockProject(iProj,iblock,Subspace.subspace);
// blockProject(oProj,oblock,Subspace.subspace);
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
for(int j=0;j<nbasis;j++){
if( disp!= 0 ) {
A[p]._odata[ss](j,i) = oProj._odata[ss](j);
}
A[self_stencil]._odata[ss](j,i) = A[self_stencil]._odata[ss](j,i) + iProj._odata[ss](j);
}
}
}
}
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< A[p] << std::endl;
}
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
// ForceHermitian();
AssertHermitian();
// ForceDiagonal();
}
void ForceDiagonal(void) {
std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
std::cout<<GridLogMessage<<"**** Forcing coarse operator to be diagonal ****"<<std::endl;
std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
for(int p=0;p<8;p++){
A[p]=zero;
}
GridParallelRNG RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
Complex one(1.0);
iMatrix<CComplex,nbasis> ident; ident=one;
val = val*adj(val);
val = val + 1.0;
A[8] = val*ident;
// for(int s=0;s<Grid()->oSites();s++) {
// A[8]._odata[s]=val._odata[s];
// }
}
void ForceHermitian(void) {
for(int d=0;d<4;d++){
int dd=d+1;
A[2*d] = adj(Cshift(A[2*d+1],dd,1));
}
// A[8] = 0.5*(A[8] + adj(A[8]));
}
void AssertHermitian(void) {
CoarseMatrix AA (Grid());
CoarseMatrix AAc (Grid());
CoarseMatrix Diff (Grid());
for(int d=0;d<4;d++){
int dd=d+1;
AAc = Cshift(A[2*d+1],dd,1);
AA = A[2*d];
Diff = AA - adj(AAc);
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
}
Diff = A[8] - adj(A[8]);
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
}
};
}
#endif

306
Grid/algorithms/FFT.h Normal file
View File

@ -0,0 +1,306 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Cshift.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_FFT_H_
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#ifdef USE_MKL
#include <fftw/fftw3.h>
#else
#include <fftw3.h>
#endif
#endif
namespace Grid {
template<class scalar> struct FFTW { };
#ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> {
public:
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p);
}
};
template<> struct FFTW<ComplexF> {
public:
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p);
}
};
#endif
#ifndef FFTW_FORWARD
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#endif
class FFT {
private:
GridCartesian *vgrid;
GridCartesian *sgrid;
int Nd;
double flops;
double flops_call;
uint64_t usec;
std::vector<int> dimensions;
std::vector<int> processors;
std::vector<int> processor_coor;
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) :
vgrid(grid),
Nd(grid->_ndimension),
dimensions(grid->_fdimensions),
processors(grid->_processors),
processor_coor(grid->_processor_coor)
{
flops=0;
usec =0;
std::vector<int> layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors);
};
~FFT ( void) {
delete sgrid;
}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
Lattice<vobj> tmp(vgrid);
tmp = source;
for(int d=0;d<Nd;d++){
if( mask[d] ) {
FFT_dim(result,tmp,d,sign);
tmp=result;
}
}
}
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
std::vector<int> mask(Nd,1);
FFT_dim_mask(result,source,mask,sign);
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
assert(0);
#else
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
std::vector<int> layout(Nd,1);
std::vector<int> pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
PARALLEL_REGION
{
std::vector<int> cbuf(Nd);
sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
}
}
if (p != processors[dim] - 1)
{
result = Cshift(result,dim,L);
}
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
PARALLEL_REGION
{
std::vector<int> cbuf(Nd);
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<NN;idx++) {
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
}
}
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
// writing out result
PARALLEL_REGION
{
std::vector<int> clbuf(Nd), cgbuf(Nd);
sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
}
}
result = result*div;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
};
}
#endif

View File

@ -0,0 +1,481 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/LinearOperator.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_LINEAR_OP_H
#define GRID_ALGORITHM_LINEAR_OP_H
namespace Grid {
/////////////////////////////////////////////////////////////////////////////////////////////
// LinearOperators Take a something and return a something.
/////////////////////////////////////////////////////////////////////////////////////////////
//
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real):
//SBase
// i) F(a x + b y) = aF(x) + b F(y).
// ii) <x|Op|y> = <y|AdjOp|x>^\ast
//
// Would be fun to have a test linearity & Herm Conj function!
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class LinearOperatorBase {
public:
// Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
virtual void HermOp(const Field &in, Field &out)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
// By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
// between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
// the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
// replication of code.
//
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes
/////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// Construct herm op from non-herm matrix
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class MdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
};
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _shift;
public:
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
assert(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
assert(0);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
assert(0);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
assert(0);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2);
out = out + _shift*in;
ComplexD dot;
dot= innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
};
////////////////////////////////////////////////////////////////////
// Wrap an already herm matrix
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class HermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.M(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.M(in,out);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
// ways to introduce the even odd checkerboarding
//////////////////////////////////////////////////////////
template<class Field>
class SchurOperatorBase : public LinearOperatorBase<Field> {
public:
virtual RealD Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in._grid);
tmp.checkerboard = in.checkerboard;
ni=Mpc(in,tmp);
no=MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.checkerboard = in.checkerboard;
MpcDagMpc(in,out,n1,n2);
}
virtual void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
void Op (const Field &in, Field &out){
Mpc(in,out);
}
void AdjOp (const Field &in, Field &out){
MpcDag(in,out);
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0); // must coarsen the unpreconditioned system
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
};
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
tmp.checkerboard = !in.checkerboard;
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl;
_Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
_Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp);
_Mat.MooeeDag(in,out);
return axpy_norm(out,-1.0,tmp,out);
}
};
template<class Matrix,class Field>
class SchurDiagOneOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
_Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp);
_Mat.Meooe(tmp,out);
_Mat.MooeeInv(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
_Mat.MooeeInvDag(in,out);
_Mat.MeooeDag(out,tmp);
_Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
};
template<class Matrix,class Field>
class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
_Mat.MooeeInv(in,out);
_Mat.Meooe(out,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
_Mat.MeooeDag(in,out);
_Mat.MooeeInvDag(out,tmp);
_Mat.MeooeDag(tmp,out);
_Mat.MooeeInvDag(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Staggered use
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
Field tmp;
RealD mass;
double tMpc;
double tIP;
double tMeo;
double taxpby_norm;
uint64_t ncall;
public:
void Report(void)
{
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
}
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
assert( _Mat.isTrivialEE() );
mass = _Mat.Mass();
tMpc=0;
tIP =0;
tMeo=0;
taxpby_norm=0;
ncall=0;
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ncall++;
tMpc-=usecond();
n2 = Mpc(in,out);
tMpc+=usecond();
tIP-=usecond();
ComplexD dot= innerProduct(in,out);
tIP+=usecond();
n1 = real(dot);
}
virtual void HermOp(const Field &in, Field &out){
ncall++;
tMpc-=usecond();
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMpc+=usecond();
taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
}
virtual RealD Mpc (const Field &in, Field &out) {
tMeo-=usecond();
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMeo+=usecond();
taxpby_norm-=usecond();
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
}
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
/////////////////////////////////////////////////////////////
// Base classes for functions of operators
/////////////////////////////////////////////////////////////
template<class Field> class OperatorFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
};
template<class Field> class LinearFunction {
public:
virtual void operator() (const Field &in, Field &out) = 0;
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
public:
void operator() (const Field &in, Field &out){
out = in;
};
};
/////////////////////////////////////////////////////////////
// Base classes for Multishift solvers for operators
/////////////////////////////////////////////////////////////
template<class Field> class OperatorMultiFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
};
// FIXME : To think about
// Chroma functionality list defining LinearOperator
/*
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
virtual const Subset& subset() const = 0;
virtual unsigned long nFlops() const { return 0; }
virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
const Subset& subset() const {return all;}
};
*/
////////////////////////////////////////////////////////////////////////////////////////////
// Hermitian operator Linear function and operator function
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Linop.HermOp(in,out);
};
};
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
{}
void operator()(const Field& in, Field& out) {
_Linop.HermOp(in,out);
}
};
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
: _poly(poly), _Linop(linop) {};
void operator()(const Field& in, Field& out) {
_poly(_Linop,in,out);
}
};
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
public:
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in._grid);
Field Mtmp(in._grid);
AtoN = in;
out = AtoN*Coeffs[0];
for(int n=1;n<Coeffs.size();n++){
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
}
};
};
}
#endif

View File

@ -0,0 +1,46 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/Preconditioner.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PRECONDITIONER_H
#define GRID_PRECONDITIONER_H
namespace Grid {
template<class Field> class Preconditioner : public LinearFunction<Field> {
virtual void operator()(const Field &src, Field & psi)=0;
};
template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public:
void operator()(const Field &src, Field & psi){
psi = src;
}
TrivialPrecon(void){};
};
}
#endif

View File

@ -0,0 +1,71 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/SparseMatrix.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_SPARSE_MATRIX_H
#define GRID_ALGORITHM_SPARSE_MATRIX_H
namespace Grid {
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SparseMatrixBase {
public:
virtual GridBase *Grid(void) =0;
// Full checkerboar operations
virtual RealD M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp (in._grid);
ni=M(in,tmp);
no=Mdag(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface augmented by a red black sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public:
virtual GridBase *RedBlackGrid(void)=0;
// half checkerboard operaions
virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0;
virtual void MooeeInv (const Field &in, Field &out)=0;
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
};
}
#endif

View File

@ -0,0 +1,377 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/Chebyshev.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CHEBYSHEV_H
#define GRID_CHEBYSHEV_H
#include <Grid/algorithms/LinearOperator.h>
namespace Grid {
struct ChebyParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
RealD, alpha,
RealD, beta,
int, Npoly);
};
////////////////////////////////////////////////////////////////////////////////////////////
// Generic Chebyshev approximations
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class Chebyshev : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD hi;
RealD lo;
public:
void csv(std::ostream &out){
RealD diff = hi-lo;
RealD delta = (hi-lo)*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
// Convenience for plotting the approximation
void PlotApprox(std::ostream &out) {
out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
out <<x<<"\t"<<approx(x)<<std::endl;
}
};
Chebyshev(){};
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
////////////////////////////////////////////////////////////////////////////////////////////////////
// CJ: the one we need for Lanczos
void Init(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
};
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
}
sumUsq = std::sqrt(sumUsq);
for(int i=1;i<=M;i++){
a[i] = U[i]/sumUsq;
}
g[0] = 1.0;
for(int m=1;m<=M;m++){
g[m] = 0;
for(int i=0;i<=M-m;i++){
g[m]+= a[i]*a[m+i];
}
}
for(int m=1;m<=M;m++){
Coeffs[m]*=g[m];
}
}
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
RealD approxD(RealD x)
{
RealD Un;
RealD Unm;
RealD Unp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD U0=1;
RealD U1=2*y;
RealD sum;
sum = Coeffs[1]*U0;
sum+= Coeffs[2]*U1*2.0;
Un =U1;
Unm=U0;
for(int i=2;i<order-1;i++){
Unp=2*y*Un-Unm;
Unm=Un;
Un =Unp;
sum+= Un*Coeffs[i+1]*(i+1.0);
}
return sum/(0.5*(hi-lo));
};
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
RealD x = x0;
RealD eps;
int i;
for (i=0;i<maxiter;i++) {
eps = approx(x) - z;
if (fabs(eps / z) < resid)
return x;
x = x - eps / approxD(x);
}
return std::numeric_limits<double>::quiet_NaN();
}
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
// std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
T1=y*xscale+in*mscale;
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;
RealD beta;
RealD mu;
public:
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
alpha(_alpha),
beta(_beta),
mu(_mu)
{
order=_order;
Coeffs.resize(order);
for(int i=0;i<_order;i++){
Coeffs[i] = 0.0;
}
Coeffs[order-1]=1.0;
};
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
RealD approx(RealD xx) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
Real aa = alpha * alpha;
Real bb = beta * beta;
RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
RealD y= x;
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
// shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
GridBase *grid=in._grid;
Field tmp(grid);
RealD aa= alpha*alpha;
RealD bb= beta * beta;
Linop.HermOp(in,out);
out = out - mu*in;
Linop.HermOp(out,tmp);
tmp = tmp - mu * out;
out = (2.0/ (aa-bb) ) * tmp - ((aa+bb)/(aa-bb))*in;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
}
#endif

View File

@ -0,0 +1,152 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/Forecast.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H
namespace Grid {
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
class Forecast
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = zero; return chi; }
else if(degree == 1){ return prev_solns[0]; }
RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = std::conj(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = zero;
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = std::conj(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
}
#endif

View File

@ -0,0 +1,21 @@
Copyright (c) 2011 Michael Clark
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@ -0,0 +1,56 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/MultiShiftFunction.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
double MultiShiftFunction::approx(double x)
{
double a = norm;
for(int n=0;n<poles.size();n++){
a = a + residues[n]/(x+poles[n]);
}
return a;
}
void MultiShiftFunction::gnuplot(std::ostream &out)
{
out<<"f(x) = "<<norm<<"";
for(int n=0;n<poles.size();n++){
out<<"+("<<residues[n]<<"/(x+"<<poles[n]<<"))";
}
out<<";"<<std::endl;
}
void MultiShiftFunction::csv(std::ostream &out)
{
for (double x=lo; x<hi; x*=1.05) {
double f = approx(x);
double r = sqrt(x);
out<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
}
return;
}
}

View File

@ -0,0 +1,67 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/MultiShiftFunction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef MULTI_SHIFT_FUNCTION
#define MULTI_SHIFT_FUNCTION
namespace Grid {
class MultiShiftFunction {
public:
int order;
std::vector<RealD> poles;
std::vector<RealD> residues;
std::vector<RealD> tolerances;
RealD norm;
RealD lo,hi;
MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
RealD approx(RealD x);
void csv(std::ostream &out);
void gnuplot(std::ostream &out);
void Init(AlgRemez & remez,double tol,bool inverse)
{
order=remez.getDegree();
tolerances.resize(remez.getDegree(),tol);
poles.resize(remez.getDegree());
residues.resize(remez.getDegree());
remez.getBounds(lo,hi);
if ( inverse ) remez.getIPFE (&residues[0],&poles[0],&norm);
else remez.getPFE (&residues[0],&poles[0],&norm);
}
// Allow deferred initialisation
MultiShiftFunction(void){};
MultiShiftFunction(AlgRemez & remez,double tol,bool inverse)
{
Init(remez,tol,inverse);
}
};
}
#endif

View File

@ -0,0 +1,80 @@
-----------------------------------------------------------------------------------
PAB. Took Mike Clark's AlgRemez from GitHub and (modified a little) include.
This is open source and license and readme and comments are preserved consistent
with the license. Mike, thankyou!
-----------------------------------------------------------------------------------
-----------------------------------------------------------------------------------
AlgRemez
The archive downloadable here contains an implementation of the Remez
algorithm which calculates optimal rational (and polynomial)
approximations to the nth root over a given spectral range. The Remez
algorithm, although in principle is extremely straightforward to
program, is quite difficult to get completely correct, e.g., the Maple
implementation of the algorithm does not always converge to the
correct answer.
To use this algorithm you need to install GMP, the GNU Multiple
Precision Library, and when configuring the install, you must include
the --enable-mpfr option (see the GMP manual for more details). You
also have to edit the Makefile for AlgRemez appropriately for your
system, namely to point corrrectly to the location of the GMP library.
The simple main program included with this archive invokes the
AlgRemez class to calculate an approximation given by command line
arguments. It is invoked by the following
./test y z n d lambda_low lambda_high precision,
where the function to be approximated is f(x) = x^(y/z), with degree
(n,d) over the spectral range [lambda_low, lambda_high], using
precision digits of precision in the arithmetic. So an example would
be
./test 1 2 5 5 0.0004 64 40
which corresponds to constructing a rational approximation to the
square root function, with degree (5,5) over the range [0.0004,64]
with 40 digits of precision used for the arithmetic. The parameters y
and z must be positive, the approximation to f(x) = x^(-y/z) is simply
the inverse of the approximation to f(x) = x^(y/z). After the
approximation has been constructed, the roots and poles of the
rational function are found, and then the partial fraction expansion
of both the rational function and it's inverse are found, the results
of which are output to a file called "approx.dat". In addition, the
error function of the approximation is output to "error.dat", where it
can be checked that the resultant approximation satisfies Chebychev's
criterion, namely all error maxima are equal in magnitude, and
adjacent maxima are oppostie in sign. There are some caveats here
however, the optimal polynomial approximation has complex roots, and
the root finding implemented here cannot (yet) handle complex roots.
In addition, the partial fraction expansion of rational approximations
is only found for the case n = d, i.e., the degree of numerator
polynomial equals that of the denominator polynomial. The convention
for the partial fraction expansion is that polar shifts are always
written added to x, not subtracted.
To do list
1. Include an exponential dampening factor in the function to be
approximated. This may sound trivial to implement, but for some
parameters, the algorithm seems to breakdown. Also, the roots in the
rational approximation sometimes become complex, which currently
breaks the stupidly simple root finding code.
2. Make the algorithm faster - it's too slow when running on qcdoc.
3. Add complex root finding.
4. Add more options for error minimisation - currently the code
minimises the relative error, should add options for absolute error,
and other norms.
There will be a forthcoming publication concerning the results
generated by this software, but in the meantime, if you use this
software, please cite it as
"M.A. Clark and A.D. Kennedy, https://github.com/mikeaclark/AlgRemez, 2005".
If you have any problems using the software, then please email scientist.mike@gmail.com.

View File

@ -0,0 +1,760 @@
/*
Mike Clark - 25th May 2005
alg_remez.C
AlgRemez is an implementation of the Remez algorithm, which in this
case is used for generating the optimal nth root rational
approximation.
Note this class requires the gnu multiprecision (GNU MP) library.
*/
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<string>
#include<iostream>
#include<iomanip>
#include<cassert>
#include<Grid/algorithms/approx/Remez.h>
// Constructor
AlgRemez::AlgRemez(double lower, double upper, long precision)
{
prec = precision;
bigfloat::setDefaultPrecision(prec);
apstrt = lower;
apend = upper;
apwidt = apend - apstrt;
std::cout<<"Approximation bounds are ["<<apstrt<<","<<apend<<"]\n";
std::cout<<"Precision of arithmetic is "<<precision<<std::endl;
alloc = 0;
n = 0;
d = 0;
foundRoots = 0;
// Only require the approximation spread to be less than 1 ulp
tolerance = 1e-15;
}
// Destructor
AlgRemez::~AlgRemez()
{
if (alloc) {
delete [] param;
delete [] roots;
delete [] poles;
delete [] xx;
delete [] mm;
delete [] a_power;
delete [] a;
}
}
// Free memory and reallocate as necessary
void AlgRemez::allocate(int num_degree, int den_degree)
{
// Arrays have previously been allocated, deallocate first, then allocate
if (alloc) {
delete [] param;
delete [] roots;
delete [] poles;
delete [] xx;
delete [] mm;
}
// Note use of new and delete in memory allocation - cannot run on qcdsp
param = new bigfloat[num_degree+den_degree+1];
roots = new bigfloat[num_degree];
poles = new bigfloat[den_degree];
xx = new bigfloat[num_degree+den_degree+3];
mm = new bigfloat[num_degree+den_degree+2];
if (!alloc) {
// The coefficients of the sum in the exponential
a = new bigfloat[SUM_MAX];
a_power = new int[SUM_MAX];
}
alloc = 1;
}
// Reset the bounds of the approximation
void AlgRemez::setBounds(double lower, double upper)
{
apstrt = lower;
apend = upper;
apwidt = apend - apstrt;
}
// Generate the rational approximation x^(pnum/pden)
double AlgRemez::generateApprox(int degree, unsigned long pnum,
unsigned long pden)
{
return generateApprox(degree, degree, pnum, pden);
}
double AlgRemez::generateApprox(int num_degree, int den_degree,
unsigned long pnum, unsigned long pden)
{
double *a_param = 0;
int *a_pow = 0;
return generateApprox(num_degree, den_degree, pnum, pden, 0, a_param, a_pow);
}
// Generate the rational approximation x^(pnum/pden)
double AlgRemez::generateApprox(int num_degree, int den_degree,
unsigned long pnum, unsigned long pden,
int a_len, double *a_param, int *a_pow)
{
std::cout<<"Degree of the approximation is ("<<num_degree<<","<<den_degree<<")\n";
std::cout<<"Approximating the function x^("<<pnum<<"/"<<pden<<")\n";
// Reallocate arrays, since degree has changed
if (num_degree != n || den_degree != d) allocate(num_degree,den_degree);
assert(a_len<=SUM_MAX);
step = new bigfloat[num_degree+den_degree+2];
a_length = a_len;
for (int j=0; j<a_len; j++) {
a[j]= a_param[j];
a_power[j] = a_pow[j];
}
power_num = pnum;
power_den = pden;
spread = 1.0e37;
iter = 0;
n = num_degree;
d = den_degree;
neq = n + d + 1;
initialGuess();
stpini(step);
while (spread > tolerance) { //iterate until convergance
if (iter++%100==0)
std::cout<<"Iteration " <<iter-1<<" spread "<<(double)spread<<" delta "<<(double)delta<<std::endl;
equations();
if (delta < tolerance) {
std::cout<<"Delta too small, try increasing precision\n";
assert(0);
};
assert( delta>= tolerance);
search(step);
}
int sign;
double error = (double)getErr(mm[0],&sign);
std::cout<<"Converged at "<<iter<<" iterations; error = "<<error<<std::endl;
// Once the approximation has been generated, calculate the roots
if(!root()) {
std::cout<<"Root finding failed\n";
} else {
foundRoots = 1;
}
delete [] step;
// Return the maximum error in the approximation
return error;
}
// Return the partial fraction expansion of the approximation x^(pnum/pden)
int AlgRemez::getPFE(double *Res, double *Pole, double *Norm) {
if (n!=d) {
std::cout<<"Cannot handle case: Numerator degree neq Denominator degree\n";
return 0;
}
if (!alloc) {
std::cout<<"Approximation not yet generated\n";
return 0;
}
if (!foundRoots) {
std::cout<<"Roots not found, so PFE cannot be taken\n";
return 0;
}
bigfloat *r = new bigfloat[n];
bigfloat *p = new bigfloat[d];
for (int i=0; i<n; i++) r[i] = roots[i];
for (int i=0; i<d; i++) p[i] = poles[i];
// Perform a partial fraction expansion
pfe(r, p, norm);
// Convert to double and return
*Norm = (double)norm;
for (int i=0; i<n; i++) Res[i] = (double)r[i];
for (int i=0; i<d; i++) Pole[i] = (double)p[i];
delete [] r;
delete [] p;
// Where the smallest shift is located
return 0;
}
// Return the partial fraction expansion of the approximation x^(-pnum/pden)
int AlgRemez::getIPFE(double *Res, double *Pole, double *Norm) {
if (n!=d) {
std::cout<<"Cannot handle case: Numerator degree neq Denominator degree\n";
return 0;
}
if (!alloc) {
std::cout<<"Approximation not yet generated\n";
return 0;
}
if (!foundRoots) {
std::cout<<"Roots not found, so PFE cannot be taken\n";
return 0;
}
bigfloat *r = new bigfloat[d];
bigfloat *p = new bigfloat[n];
// Want the inverse function
for (int i=0; i<n; i++) {
r[i] = poles[i];
p[i] = roots[i];
}
// Perform a partial fraction expansion
pfe(r, p, (bigfloat)1l/norm);
// Convert to double and return
*Norm = (double)((bigfloat)1l/(norm));
for (int i=0; i<n; i++) {
Res[i] = (double)r[i];
Pole[i] = (double)p[i];
}
delete [] r;
delete [] p;
// Where the smallest shift is located
return 0;
}
// Initial values of maximal and minimal errors
void AlgRemez::initialGuess() {
// Supply initial guesses for solution points
long ncheb = neq; // Degree of Chebyshev error estimate
bigfloat a, r;
// Find ncheb+1 extrema of Chebyshev polynomial
a = ncheb;
mm[0] = apstrt;
for (long i = 1; i < ncheb; i++) {
r = 0.5 * (1 - cos((M_PI * i)/(double) a));
//r *= sqrt_bf(r);
r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
mm[i] = apstrt + r * apwidt;
}
mm[ncheb] = apend;
a = 2.0 * ncheb;
for (long i = 0; i <= ncheb; i++) {
r = 0.5 * (1 - cos(M_PI * (2*i+1)/(double) a));
//r *= sqrt_bf(r); // Squeeze to low end of interval
r = (exp((double)r)-1.0)/(exp(1.0)-1.0);
xx[i] = apstrt + r * apwidt;
}
}
// Initialise step sizes
void AlgRemez::stpini(bigfloat *step) {
xx[neq+1] = apend;
delta = 0.25;
step[0] = xx[0] - apstrt;
for (int i = 1; i < neq; i++) step[i] = xx[i] - xx[i-1];
step[neq] = step[neq-1];
}
// Search for error maxima and minima
void AlgRemez::search(bigfloat *step) {
bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
int i, j, meq, emsign, ensign, steps;
meq = neq + 1;
bigfloat *yy = new bigfloat[meq];
bigfloat eclose = 1.0e30;
bigfloat farther = 0l;
j = 1;
xx0 = apstrt;
for (i = 0; i < meq; i++) {
steps = 0;
xx1 = xx[i]; // Next zero
if (i == meq-1) xx1 = apend;
xm = mm[i];
ym = getErr(xm,&emsign);
q = step[i];
xn = xm + q;
if (xn < xx0 || xn >= xx1) { // Cannot skip over adjacent boundaries
q = -q;
xn = xm;
yn = ym;
ensign = emsign;
} else {
yn = getErr(xn,&ensign);
if (yn < ym) {
q = -q;
xn = xm;
yn = ym;
ensign = emsign;
}
}
while(yn >= ym) { // March until error becomes smaller.
if (++steps > 10) break;
ym = yn;
xm = xn;
emsign = ensign;
a = xm + q;
if (a == xm || a <= xx0 || a >= xx1) break;// Must not skip over the zeros either side.
xn = a;
yn = getErr(xn,&ensign);
}
mm[i] = xm; // Position of maximum
yy[i] = ym; // Value of maximum
if (eclose > ym) eclose = ym;
if (farther < ym) farther = ym;
xx0 = xx1; // Walk to next zero.
} // end of search loop
q = (farther - eclose); // Decrease step size if error spread increased
if (eclose != 0.0) q /= eclose; // Relative error spread
if (q >= spread) delta *= 0.5; // Spread is increasing; decrease step size
spread = q;
for (i = 0; i < neq; i++) {
q = yy[i+1];
if (q != 0.0) q = yy[i] / q - (bigfloat)1l;
else q = 0.0625;
if (q > (bigfloat)0.25) q = 0.25;
q *= mm[i+1] - mm[i];
step[i] = q * delta;
}
step[neq] = step[neq-1];
for (i = 0; i < neq; i++) { // Insert new locations for the zeros.
xm = xx[i] - step[i];
if (xm <= apstrt) continue;
if (xm >= apend) continue;
if (xm <= mm[i]) xm = (bigfloat)0.5 * (mm[i] + xx[i]);
if (xm >= mm[i+1]) xm = (bigfloat)0.5 * (mm[i+1] + xx[i]);
xx[i] = xm;
}
delete [] yy;
}
// Solve the equations
void AlgRemez::equations(void) {
bigfloat x, y, z;
int i, j, ip;
bigfloat *aa;
bigfloat *AA = new bigfloat[(neq)*(neq)];
bigfloat *BB = new bigfloat[neq];
for (i = 0; i < neq; i++) { // set up the equations for solution by simq()
ip = neq * i; // offset to 1st element of this row of matrix
x = xx[i]; // the guess for this row
y = func(x); // right-hand-side vector
z = (bigfloat)1l;
aa = AA+ip;
for (j = 0; j <= n; j++) {
*aa++ = z;
z *= x;
}
z = (bigfloat)1l;
for (j = 0; j < d; j++) {
*aa++ = -y * z;
z *= x;
}
BB[i] = y * z; // Right hand side vector
}
// Solve the simultaneous linear equations.
if (simq(AA, BB, param, neq)) {
std::cout<<"simq failed\n";
exit(0);
}
delete [] AA;
delete [] BB;
}
// Evaluate the rational form P(x)/Q(x) using coefficients
// from the solution vector param
bigfloat AlgRemez::approx(const bigfloat x) {
bigfloat yn, yd;
int i;
// Work backwards toward the constant term.
yn = param[n]; // Highest order numerator coefficient
for (i = n-1; i >= 0; i--) yn = x * yn + param[i];
yd = x + param[n+d]; // Highest degree coefficient = 1.0
for (i = n+d-1; i > n; i--) yd = x * yd + param[i];
return(yn/yd);
}
// Compute size and sign of the approximation error at x
bigfloat AlgRemez::getErr(bigfloat x, int *sign) {
bigfloat e, f;
f = func(x);
e = approx(x) - f;
if (f != 0) e /= f;
if (e < (bigfloat)0.0) {
*sign = -1;
e = -e;
}
else *sign = 1;
return(e);
}
// Calculate function required for the approximation.
bigfloat AlgRemez::func(const bigfloat x) {
bigfloat z = (bigfloat)power_num / (bigfloat)power_den;
bigfloat y;
if (x == (bigfloat)1.0) y = (bigfloat)1.0;
else y = pow_bf(x,z);
if (a_length > 0) {
bigfloat sum = 0l;
for (int j=0; j<a_length; j++) sum += a[j]*pow_bf(x,a_power[j]);
return y * exp_bf(sum);
} else {
return y;
}
}
// Solve the system AX=B
int AlgRemez::simq(bigfloat A[], bigfloat B[], bigfloat X[], int n) {
int i, j, ij, ip, ipj, ipk, ipn;
int idxpiv, iback;
int k, kp, kp1, kpk, kpn;
int nip, nkp, nm1;
bigfloat em, q, rownrm, big, size, pivot, sum;
bigfloat *aa;
// simq() work vector
int *IPS = new int[(neq) * sizeof(int)];
nm1 = n - 1;
// Initialize IPS and X
ij = 0;
for (i = 0; i < n; i++) {
IPS[i] = i;
rownrm = 0.0;
for(j = 0; j < n; j++) {
q = abs_bf(A[ij]);
if(rownrm < q) rownrm = q;
++ij;
}
if (rownrm == (bigfloat)0l) {
std::cout<<"simq rownrm=0\n";
delete [] IPS;
return(1);
}
X[i] = (bigfloat)1.0 / rownrm;
}
for (k = 0; k < nm1; k++) {
big = 0.0;
idxpiv = 0;
for (i = k; i < n; i++) {
ip = IPS[i];
ipk = n*ip + k;
size = abs_bf(A[ipk]) * X[ip];
if (size > big) {
big = size;
idxpiv = i;
}
}
if (big == (bigfloat)0l) {
std::cout<<"simq big=0\n";
delete [] IPS;
return(2);
}
if (idxpiv != k) {
j = IPS[k];
IPS[k] = IPS[idxpiv];
IPS[idxpiv] = j;
}
kp = IPS[k];
kpk = n*kp + k;
pivot = A[kpk];
kp1 = k+1;
for (i = kp1; i < n; i++) {
ip = IPS[i];
ipk = n*ip + k;
em = -A[ipk] / pivot;
A[ipk] = -em;
nip = n*ip;
nkp = n*kp;
aa = A+nkp+kp1;
for (j = kp1; j < n; j++) {
ipj = nip + j;
A[ipj] = A[ipj] + em * *aa++;
}
}
}
kpn = n * IPS[n-1] + n - 1; // last element of IPS[n] th row
if (A[kpn] == (bigfloat)0l) {
std::cout<<"simq A[kpn]=0\n";
delete [] IPS;
return(3);
}
ip = IPS[0];
X[0] = B[ip];
for (i = 1; i < n; i++) {
ip = IPS[i];
ipj = n * ip;
sum = 0.0;
for (j = 0; j < i; j++) {
sum += A[ipj] * X[j];
++ipj;
}
X[i] = B[ip] - sum;
}
ipn = n * IPS[n-1] + n - 1;
X[n-1] = X[n-1] / A[ipn];
for (iback = 1; iback < n; iback++) {
//i goes (n-1),...,1
i = nm1 - iback;
ip = IPS[i];
nip = n*ip;
sum = 0.0;
aa = A+nip+i+1;
for (j= i + 1; j < n; j++)
sum += *aa++ * X[j];
X[i] = (X[i] - sum) / A[nip+i];
}
delete [] IPS;
return(0);
}
// Calculate the roots of the approximation
int AlgRemez::root() {
long i,j;
bigfloat x,dx=0.05;
bigfloat upper=1, lower=-100000;
bigfloat tol = 1e-20;
bigfloat *poly = new bigfloat[neq+1];
// First find the numerator roots
for (i=0; i<=n; i++) poly[i] = param[i];
for (i=n-1; i>=0; i--) {
roots[i] = rtnewt(poly,i+1,lower,upper,tol);
if (roots[i] == 0.0) {
std::cout<<"Failure to converge on root "<<i+1<<"/"<<n<<"\n";
return 0;
}
poly[0] = -poly[0]/roots[i];
for (j=1; j<=i; j++) poly[j] = (poly[j-1] - poly[j])/roots[i];
}
// Now find the denominator roots
poly[d] = 1l;
for (i=0; i<d; i++) poly[i] = param[n+1+i];
for (i=d-1; i>=0; i--) {
poles[i]=rtnewt(poly,i+1,lower,upper,tol);
if (poles[i] == 0.0) {
std::cout<<"Failure to converge on pole "<<i+1<<"/"<<d<<"\n";
return 0;
}
poly[0] = -poly[0]/poles[i];
for (j=1; j<=i; j++) poly[j] = (poly[j-1] - poly[j])/poles[i];
}
norm = param[n];
delete [] poly;
return 1;
}
// Evaluate the polynomial
bigfloat AlgRemez::polyEval(bigfloat x, bigfloat *poly, long size) {
bigfloat f = poly[size];
for (int i=size-1; i>=0; i--) f = f*x + poly[i];
return f;
}
// Evaluate the differential of the polynomial
bigfloat AlgRemez::polyDiff(bigfloat x, bigfloat *poly, long size) {
bigfloat df = (bigfloat)size*poly[size];
for (int i=size-1; i>0; i--) df = df*x + (bigfloat)i*poly[i];
return df;
}
// Newton's method to calculate roots
bigfloat AlgRemez::rtnewt(bigfloat *poly, long i, bigfloat x1,
bigfloat x2, bigfloat xacc) {
int j;
bigfloat df, dx, f, rtn;
rtn=(bigfloat)0.5*(x1+x2);
for (j=1; j<=JMAX;j++) {
f = polyEval(rtn, poly, i);
df = polyDiff(rtn, poly, i);
dx = f/df;
rtn -= dx;
if (abs_bf(dx) < xacc) return rtn;
}
std::cout<<"Maximum number of iterations exceeded in rtnewt\n";
return 0.0;
}
// Evaluate the partial fraction expansion of the rational function
// with res roots and poles poles. Result is overwritten on input
// arrays.
void AlgRemez::pfe(bigfloat *res, bigfloat *poles, bigfloat norm) {
int i,j,small;
bigfloat temp;
bigfloat *numerator = new bigfloat[n];
bigfloat *denominator = new bigfloat[d];
// Construct the polynomials explicitly
for (i=1; i<n; i++) {
numerator[i] = 0l;
denominator[i] = 0l;
}
numerator[0]=1l;
denominator[0]=1l;
for (j=0; j<n; j++) {
for (i=n-1; i>=0; i--) {
numerator[i] *= -res[j];
denominator[i] *= -poles[j];
if (i>0) {
numerator[i] += numerator[i-1];
denominator[i] += denominator[i-1];
}
}
}
// Convert to proper fraction form.
// Fraction is now in the form 1 + n/d, where O(n)+1=O(d)
for (i=0; i<n; i++) numerator[i] -= denominator[i];
// Find the residues of the partial fraction expansion and absorb the
// coefficients.
for (i=0; i<n; i++) {
res[i] = 0l;
for (j=n-1; j>=0; j--) {
res[i] = poles[i]*res[i]+numerator[j];
}
for (j=n-1; j>=0; j--) {
if (i!=j) res[i] /= poles[i]-poles[j];
}
res[i] *= norm;
}
// res now holds the residues
j = 0;
for (i=0; i<n; i++) poles[i] = -poles[i];
// Move the ordering of the poles from smallest to largest
for (j=0; j<n; j++) {
small = j;
for (i=j+1; i<n; i++) {
if (poles[i] < poles[small]) small = i;
}
if (small != j) {
temp = poles[small];
poles[small] = poles[j];
poles[j] = temp;
temp = res[small];
res[small] = res[j];
res[j] = temp;
}
}
delete [] numerator;
delete [] denominator;
}
double AlgRemez::evaluateApprox(double x) {
return (double)approx((bigfloat)x);
}
double AlgRemez::evaluateInverseApprox(double x) {
return 1.0/(double)approx((bigfloat)x);
}
double AlgRemez::evaluateFunc(double x) {
return (double)func((bigfloat)x);
}
double AlgRemez::evaluateInverseFunc(double x) {
return 1.0/(double)func((bigfloat)x);
}
void AlgRemez::csv(std::ostream & os)
{
double lambda_low = apstrt;
double lambda_high= apend;
for (double x=lambda_low; x<lambda_high; x*=1.05) {
double f = evaluateFunc(x);
double r = evaluateApprox(x);
os<< x<<","<<r<<","<<f<<","<<r-f<<std::endl;
}
return;
}

View File

@ -0,0 +1,184 @@
/*
Mike Clark - 25th May 2005
alg_remez.h
AlgRemez is an implementation of the Remez algorithm, which in this
case is used for generating the optimal nth root rational
approximation.
Note this class requires the gnu multiprecision (GNU MP) library.
*/
#ifndef INCLUDED_ALG_REMEZ_H
#define INCLUDED_ALG_REMEZ_H
#include <stddef.h>
#include <Grid/GridStd.h>
#ifdef HAVE_LIBGMP
#include "bigfloat.h"
#else
#include "bigfloat_double.h"
#endif
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
#define SUM_MAX 10 // Maximum number of terms in exponential
/*
*Usage examples
AlgRemez remez(lambda_low,lambda_high,precision);
error = remez.generateApprox(n,d,y,z);
remez.getPFE(res,pole,&norm);
remez.getIPFE(res,pole,&norm);
remez.csv(ostream &os);
*/
class AlgRemez
{
private:
char *cname;
// The approximation parameters
bigfloat *param, *roots, *poles;
bigfloat norm;
// The numerator and denominator degree (n=d)
int n, d;
// The bounds of the approximation
bigfloat apstrt, apwidt, apend;
// the numerator and denominator of the power we are approximating
unsigned long power_num;
unsigned long power_den;
// Flag to determine whether the arrays have been allocated
int alloc;
// Flag to determine whether the roots have been found
int foundRoots;
// Variables used to calculate the approximation
int nd1, iter;
bigfloat *xx, *mm, *step;
bigfloat delta, spread, tolerance;
// The exponential summation coefficients
bigfloat *a;
int *a_power;
int a_length;
// The number of equations we must solve at each iteration (n+d+1)
int neq;
// The precision of the GNU MP library
long prec;
// Initial values of maximal and minmal errors
void initialGuess();
// Solve the equations
void equations();
// Search for error maxima and minima
void search(bigfloat *step);
// Initialise step sizes
void stpini(bigfloat *step);
// Calculate the roots of the approximation
int root();
// Evaluate the polynomial
bigfloat polyEval(bigfloat x, bigfloat *poly, long size);
//complex_bf polyEval(complex_bf x, complex_bf *poly, long size);
// Evaluate the differential of the polynomial
bigfloat polyDiff(bigfloat x, bigfloat *poly, long size);
//complex_bf polyDiff(complex_bf x, complex_bf *poly, long size);
// Newton's method to calculate roots
bigfloat rtnewt(bigfloat *poly, long i, bigfloat x1, bigfloat x2, bigfloat xacc);
//complex_bf rtnewt(complex_bf *poly, long i, bigfloat x1, bigfloat x2, bigfloat xacc);
// Evaluate the partial fraction expansion of the rational function
// with res roots and poles poles. Result is overwritten on input
// arrays.
void pfe(bigfloat *res, bigfloat* poles, bigfloat norm);
// Calculate function required for the approximation
bigfloat func(bigfloat x);
// Compute size and sign of the approximation error at x
bigfloat getErr(bigfloat x, int *sign);
// Solve the system AX=B
int simq(bigfloat *A, bigfloat *B, bigfloat *X, int n);
// Free memory and reallocate as necessary
void allocate(int num_degree, int den_degree);
// Evaluate the rational form P(x)/Q(x) using coefficients from the
// solution vector param
bigfloat approx(bigfloat x);
public:
// Constructor
AlgRemez(double lower, double upper, long prec);
// Destructor
virtual ~AlgRemez();
int getDegree(void){
assert(n==d);
return n;
}
// Reset the bounds of the approximation
void setBounds(double lower, double upper);
// Reset the bounds of the approximation
void getBounds(double &lower, double &upper) {
lower=(double)apstrt;
upper=(double)apend;
}
// Generate the rational approximation x^(pnum/pden)
double generateApprox(int num_degree, int den_degree,
unsigned long power_num, unsigned long power_den,
int a_len, double* a_param, int* a_pow);
double generateApprox(int num_degree, int den_degree,
unsigned long power_num, unsigned long power_den);
double generateApprox(int degree, unsigned long power_num,
unsigned long power_den);
// Return the partial fraction expansion of the approximation x^(pnum/pden)
int getPFE(double *res, double *pole, double *norm);
// Return the partial fraction expansion of the approximation x^(-pnum/pden)
int getIPFE(double *res, double *pole, double *norm);
// Evaluate the rational form P(x)/Q(x) using coefficients from the
// solution vector param
double evaluateApprox(double x);
// Evaluate the rational form Q(x)/P(x) using coefficients from the
// solution vector param
double evaluateInverseApprox(double x);
// Calculate function required for the approximation
double evaluateFunc(double x);
// Calculate inverse function required for the approximation
double evaluateInverseFunc(double x);
// Dump csv of function, approx and error
void csv(std::ostream &os);
};
#endif // Include guard

View File

@ -0,0 +1,727 @@
/* -*- Mode: C; comment-column: 22; fill-column: 79; compile-command: "gcc -o zolotarev zolotarev.c -ansi -pedantic -lm -DTEST"; -*- */
#define VERSION Source Time-stamp: <2015-05-18 16:32:08 neo>
/* These C routines evalute the optimal rational approximation to the signum
* function for epsilon < |x| < 1 using Zolotarev's theorem.
*
* To obtain reliable results for high degree approximations (large n) it is
* necessary to compute using sufficiently high precision arithmetic. To this
* end the code has been parameterised to work with the preprocessor names
* INTERNAL_PRECISION and PRECISION set to float, double, or long double as
* appropriate. INTERNAL_PRECISION is used in computing the Zolotarev
* coefficients, which are converted to PRECISION before being returned to the
* caller. Presumably even higher precision could be obtained using GMP or
* similar package, but bear in mind that rounding errors might also be
* significant in evaluating the resulting polynomial. The convergence criteria
* have been written in a precision-independent form. */
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#ifndef INTERNAL_PRECISION
#define INTERNAL_PRECISION double
#endif
#include "Zolotarev.h"
#define ZOLOTAREV_INTERNAL
#undef ZOLOTAREV_DATA
#define ZOLOTAREV_DATA izd
#undef ZPRECISION
#define ZPRECISION INTERNAL_PRECISION
#include "Zolotarev.h"
#undef ZOLOTAREV_INTERNAL
/* The ANSI standard appears not to know what pi is */
#ifndef M_PI
#define M_PI ((INTERNAL_PRECISION) 3.141592653589793238462643383279502884197\
169399375105820974944592307816406286208998628034825342117068)
#endif
#define ZERO ((INTERNAL_PRECISION) 0)
#define ONE ((INTERNAL_PRECISION) 1)
#define TWO ((INTERNAL_PRECISION) 2)
#define THREE ((INTERNAL_PRECISION) 3)
#define FOUR ((INTERNAL_PRECISION) 4)
#define HALF (ONE/TWO)
/* The following obscenity seems to be the simplest (?) way to coerce the C
* preprocessor to convert the value of a preprocessor token into a string. */
#define PP2(x) #x
#define PP1(a,b,c) a ## b(c)
#define STRINGIFY(name) PP1(PP,2,name)
/* Compute the partial fraction expansion coefficients (alpha) from the
* factored form */
namespace Grid {
namespace Approx {
static void construct_partfrac(izd *z) {
int dn = z -> dn, dd = z -> dd, type = z -> type;
int j, k, da = dd + 1 + type;
INTERNAL_PRECISION A = z -> A, *a = z -> a, *ap = z -> ap, *alpha;
alpha = (INTERNAL_PRECISION*) malloc(da * sizeof(INTERNAL_PRECISION));
for (j = 0; j < dd; j++)
for (k = 0, alpha[j] = A; k < dd; k++)
alpha[j] *=
(k < dn ? ap[j] - a[k] : ONE) / (k == j ? ONE : ap[j] - ap[k]);
if(type == 1) /* implicit pole at zero? */
for (k = 0, alpha[dd] = A * (dn > dd ? - a[dd] : ONE); k < dd; k++) {
alpha[dd] *= a[k] / ap[k];
alpha[k] *= (dn > dd ? ap[k] - a[dd] : ONE) / ap[k];
}
alpha[da-1] = dn == da - 1 ? A : ZERO;
z -> alpha = alpha;
z -> da = da;
return;
}
/* Convert factored polynomial into dense polynomial. The input is the overall
* factor A and the roots a[i], such that p = A product(x - a[i], i = 1..d) */
static INTERNAL_PRECISION *poly_factored_to_dense(INTERNAL_PRECISION A,
INTERNAL_PRECISION *a,
int d) {
INTERNAL_PRECISION *p;
int i, j;
p = (INTERNAL_PRECISION *) malloc((d + 2) * sizeof(INTERNAL_PRECISION));
p[0] = A;
for (i = 0; i < d; i++) {
p[i+1] = p[i];
for (j = i; j > 0; j--) p[j] = p[j-1] - a[i]*p[j];
p[0] *= - a[i];
}
return p;
}
/* Convert a rational function of the form R0(x) = x p(x^2)/q(x^2) (type 0) or
* R1(x) = p(x^2)/[x q(x^2)] (type 1) into its continued fraction
* representation. We assume that 0 <= deg(q) - deg(p) <= 1 for type 0 and 0 <=
* deg(p) - deg(q) <= 1 for type 1. On input p and q are in factored form, and
* deg(q) = dq, deg(p) = dp. The output is the continued fraction coefficients
* beta, where R(x) = beta[0] x + 1/(beta[1] x + 1/(...)).
*
* The method used is as follows. There are four cases to consider:
*
* 0.i. Type 0, deg p = deg q
*
* 0.ii. Type 0, deg p = deg q - 1
*
* 1.i. Type 1, deg p = deg q
*
* 1.ii. Type 1, deg p = deg q + 1
*
* and these are connected by two transformations:
*
* A. To obtain a continued fraction expansion of type 1 we use a single-step
* polynomial division we find beta and r(x) such that p(x) = beta x q(x) +
* r(x), with deg(r) = deg(q). This implies that p(x^2) = beta x^2 q(x^2) +
* r(x^2), and thus R1(x) = x beta + r(x^2)/(x q(x^2)) = x beta + 1/R0(x)
* with R0(x) = x q(x^2)/r(x^2).
*
* B. A continued fraction expansion of type 0 is obtained in a similar, but
* not identical, manner. We use the polynomial division algorithm to compute
* the quotient beta and the remainder r that satisfy p(x) = beta q(x) + r(x)
* with deg(r) = deg(q) - 1. We thus have x p(x^2) = x beta q(x^2) + x r(x^2),
* so R0(x) = x beta + x r(x^2)/q(x^2) = x beta + 1/R1(x) with R1(x) = q(x^2) /
* (x r(x^2)).
*
* Note that the deg(r) must be exactly deg(q) for (A) and deg(q) - 1 for (B)
* because p and q have disjoint roots all of multiplicity 1. This means that
* the division algorithm requires only a single polynomial subtraction step.
*
* The transformations between the cases form the following finite state
* automaton:
*
* +------+ +------+ +------+ +------+
* | | | | ---(A)---> | | | |
* | 0.ii | ---(B)---> | 1.ii | | 0.i | <---(A)--- | 1.i |
* | | | | <---(B)--- | | | |
* +------+ +------+ +------+ +------+
*/
static INTERNAL_PRECISION *contfrac_A(INTERNAL_PRECISION *,
INTERNAL_PRECISION *,
INTERNAL_PRECISION *,
INTERNAL_PRECISION *, int, int);
static INTERNAL_PRECISION *contfrac_B(INTERNAL_PRECISION *,
INTERNAL_PRECISION *,
INTERNAL_PRECISION *,
INTERNAL_PRECISION *, int, int);
static void construct_contfrac(izd *z){
INTERNAL_PRECISION *r, A = z -> A, *p = z -> a, *q = z -> ap;
int dp = z -> dn, dq = z -> dd, type = z -> type;
z -> db = 2 * dq + 1 + type;
z -> beta = (INTERNAL_PRECISION *)
malloc(z -> db * sizeof(INTERNAL_PRECISION));
p = poly_factored_to_dense(A, p, dp);
q = poly_factored_to_dense(ONE, q, dq);
r = (INTERNAL_PRECISION *) malloc((MAX(dp,dq) + 1) *
sizeof(INTERNAL_PRECISION));
if (type == 0) (void) contfrac_B(z -> beta, p, q, r, dp, dq);
else (void) contfrac_A(z -> beta, p, q, r, dp, dq);
free(p); free(q); free(r);
return;
}
static INTERNAL_PRECISION *contfrac_A(INTERNAL_PRECISION *beta,
INTERNAL_PRECISION *p,
INTERNAL_PRECISION *q,
INTERNAL_PRECISION *r, int dp, int dq) {
INTERNAL_PRECISION quot, *rb;
int j;
/* p(x) = x beta q(x) + r(x); dp = dq or dp = dq + 1 */
quot = dp == dq ? ZERO : p[dp] / q[dq];
r[0] = p[0];
for (j = 1; j <= dp; j++) r[j] = p[j] - quot * q[j-1];
#ifdef DEBUG
printf("%s: Continued Fraction form: deg p = %2d, deg q = %2d, beta = %g\n",
__FUNCTION__, dp, dq, (float) quot);
for (j = 0; j <= dq + 1; j++)
printf("\tp[%2d] = %14.6g\tq[%2d] = %14.6g\tr[%2d] = %14.6g\n",
j, (float) (j > dp ? ZERO : p[j]),
j, (float) (j == 0 ? ZERO : q[j-1]),
j, (float) (j == dp ? ZERO : r[j]));
#endif /* DEBUG */
*(rb = contfrac_B(beta, q, r, p, dq, dq)) = quot;
return rb + 1;
}
static INTERNAL_PRECISION *contfrac_B(INTERNAL_PRECISION *beta,
INTERNAL_PRECISION *p,
INTERNAL_PRECISION *q,
INTERNAL_PRECISION *r, int dp, int dq) {
INTERNAL_PRECISION quot, *rb;
int j;
/* p(x) = beta q(x) + r(x); dp = dq or dp = dq - 1 */
quot = dp == dq ? p[dp] / q[dq] : ZERO;
for (j = 0; j < dq; j++) r[j] = p[j] - quot * q[j];
#ifdef DEBUG
printf("%s: Continued Fraction form: deg p = %2d, deg q = %2d, beta = %g\n",
__FUNCTION__, dp, dq, (float) quot);
for (j = 0; j <= dq; j++)
printf("\tp[%2d] = %14.6g\tq[%2d] = %14.6g\tr[%2d] = %14.6g\n",
j, (float) (j > dp ? ZERO : p[j]),
j, (float) q[j],
j, (float) (j == dq ? ZERO : r[j]));
#endif /* DEBUG */
*(rb = dq > 0 ? contfrac_A(beta, q, r, p, dq, dq-1) : beta) = quot;
return rb + 1;
}
/* The global variable U is used to hold the argument u throughout the AGM
* recursion. The global variables F and K are set in the innermost
* instantiation of the recursive function AGM to the values of the elliptic
* integrals F(u,k) and K(k) respectively. They must be made thread local to
* make this code thread-safe in a multithreaded environment. */
static INTERNAL_PRECISION U, F, K; /* THREAD LOCAL */
/* Recursive implementation of Gauss' arithmetico-geometric mean, which is the
* kernel of the method used to compute the Jacobian elliptic functions
* sn(u,k), cn(u,k), and dn(u,k) with parameter k (where 0 < k < 1), as well
* as the elliptic integral F(s,k) satisfying F(sn(u,k)) = u and the complete
* elliptic integral K(k).
*
* The algorithm used is a recursive implementation of the Gauss (Landen)
* transformation.
*
* The function returns the value of sn(u,k'), where k' is the dual parameter,
* and also sets the values of the global variables F and K. The latter is
* used to determine the sign of cn(u,k').
*
* The algorithm is deemed to have converged when b ceases to increase. This
* works whatever INTERNAL_PRECISION is specified. */
static INTERNAL_PRECISION AGM(INTERNAL_PRECISION a,
INTERNAL_PRECISION b,
INTERNAL_PRECISION s) {
static INTERNAL_PRECISION pb = -ONE;
INTERNAL_PRECISION c, d, xi;
if (b <= pb) {
pb = -ONE;
F = asin(s) / a; /* Here, a is the AGM */
K = M_PI / (TWO * a);
return sin(U * a);
}
pb = b;
c = a - b;
d = a + b;
xi = AGM(HALF*d, sqrt(a*b), ONE + c*c == ONE ?
HALF*s*d/a : (a - sqrt(a*a - s*s*c*d))/(c*s));
return 2*a*xi / (d + c*xi*xi);
}
/* Computes sn(u,k), cn(u,k), dn(u,k), F(u,k), and K(k). It is essentially a
* wrapper for the routine AGM. The sign of cn(u,k) is defined to be -1 if
* K(k) < u < 3*K(k) and +1 otherwise, and thus sign is computed by some quite
* unnecessarily obfuscated bit manipulations. */
static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
INTERNAL_PRECISION* sn, INTERNAL_PRECISION* cn,
INTERNAL_PRECISION* dn, INTERNAL_PRECISION* elF,
INTERNAL_PRECISION* elK) {
int sgn;
U = u;
*sn = AGM(ONE, sqrt(ONE - k*k), u);
sgn = ((int) (fabs(u) / K)) % 4; /* sgn = 0, 1, 2, 3 */
sgn ^= sgn >> 1; /* (sgn & 1) = 0, 1, 1, 0 */
sgn = 1 - ((sgn & 1) << 1); /* sgn = 1, -1, -1, 1 */
*cn = ((INTERNAL_PRECISION) sgn) * sqrt(ONE - *sn * *sn);
*dn = sqrt(ONE - k*k* *sn * *sn);
*elF = F;
*elK = K;
}
/* Compute the coefficients for the optimal rational approximation R(x) to
* sgn(x) of degree n over the interval epsilon < |x| < 1 using Zolotarev's
* formula.
*
* Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
* type = 1 for the approximation which is infinite at x = 0. */
zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
l, invlambda, xi, xisq, *tv, s, opl;
int m, czero, ts;
zolotarev_data *zd;
izd *d = (izd*) malloc(sizeof(izd));
d -> type = type;
d -> epsilon = (INTERNAL_PRECISION) epsilon;
d -> n = n;
d -> dd = n / 2;
d -> dn = d -> dd - 1 + n % 2; /* n even: dn = dd - 1, n odd: dn = dd */
d -> deg_denom = 2 * d -> dd;
d -> deg_num = 2 * d -> dn + 1;
d -> a = (INTERNAL_PRECISION*) malloc((1 + d -> dn) *
sizeof(INTERNAL_PRECISION));
d -> ap = (INTERNAL_PRECISION*) malloc(d -> dd *
sizeof(INTERNAL_PRECISION));
ksq = d -> epsilon * d -> epsilon;
kp = sqrt(ONE - ksq);
sncndnFK(ZERO, kp, &sn, &cn, &dn, &F, &Kp); /* compute Kp = K(kp) */
z0 = TWO * Kp / (INTERNAL_PRECISION) n;
M = ONE;
A = ONE / d -> epsilon;
sncndnFK(HALF * z0, kp, &sn, &cn, &dn, &F, &Kj); /* compute xi */
xi = ONE / dn;
xisq = xi * xi;
invlambda = xi;
for (m = 0; m < d -> dd; m++) {
czero = 2 * (m + 1) == n; /* n even and m = dd -1 */
z = z0 * ((INTERNAL_PRECISION) m + ONE);
sncndnFK(z, kp, &sn, &cn, &dn, &F, &Kj);
t = cn / sn;
c = - t*t;
if (!czero) (d -> a)[d -> dn - 1 - m] = ksq / c;
z = z0 * ((INTERNAL_PRECISION) m + HALF);
sncndnFK(z, kp, &sn, &cn, &dn, &F, &Kj);
t = cn / sn;
cp = - t*t;
(d -> ap)[d -> dd - 1 - m] = ksq / cp;
M *= (ONE - c) / (ONE - cp);
A *= (czero ? -ksq : c) * (ONE - cp) / (cp * (ONE - c));
invlambda *= (ONE - c*xisq) / (ONE - cp*xisq);
}
invlambda /= M;
d -> A = TWO / (ONE + invlambda) * A;
d -> Delta = (invlambda - ONE) / (invlambda + ONE);
d -> gamma = (INTERNAL_PRECISION*) malloc((1 + d -> n) *
sizeof(INTERNAL_PRECISION));
l = ONE / invlambda;
opl = ONE + l;
sncndnFK(sqrt( d -> type == 1
? (THREE + l) / (FOUR * opl)
: (ONE + THREE*l) / (opl*opl*opl)
), sqrt(ONE - l*l), &sn, &cn, &dn, &F, &Kj);
s = M * F;
for (m = 0; m < d -> n; m++) {
sncndnFK(s + TWO*Kp*m/n, kp, &sn, &cn, &dn, &F, &Kj);
d -> gamma[m] = d -> epsilon / dn;
}
/* If R(x) is a Zolotarev rational approximation of degree (n,m) with maximum
* error Delta, then (1 - Delta^2) / R(x) is also an optimal Chebyshev
* approximation of degree (m,n) */
if (d -> type == 1) {
d -> A = (ONE - d -> Delta * d -> Delta) / d -> A;
tv = d -> a; d -> a = d -> ap; d -> ap = tv;
ts = d -> dn; d -> dn = d -> dd; d -> dd = ts;
ts = d -> deg_num; d -> deg_num = d -> deg_denom; d -> deg_denom = ts;
}
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
zd -> dd = d -> dd;
zd -> da = d -> da;
zd -> db = d -> db;
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
return zd;
}
void zolotarev_free(zolotarev_data *zdata)
{
free(zdata -> a);
free(zdata -> ap);
free(zdata -> alpha);
free(zdata -> beta);
free(zdata -> gamma);
free(zdata);
}
zolotarev_data* higham(PRECISION epsilon, int n) {
INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
int m, czero;
zolotarev_data *zd;
izd *d = (izd*) malloc(sizeof(izd));
d -> type = 0;
d -> epsilon = (INTERNAL_PRECISION) epsilon;
d -> n = n;
d -> dd = n / 2;
d -> dn = d -> dd - 1 + n % 2; /* n even: dn = dd - 1, n odd: dn = dd */
d -> deg_denom = 2 * d -> dd;
d -> deg_num = 2 * d -> dn + 1;
d -> a = (INTERNAL_PRECISION*) malloc((1 + d -> dn) *
sizeof(INTERNAL_PRECISION));
d -> ap = (INTERNAL_PRECISION*) malloc(d -> dd *
sizeof(INTERNAL_PRECISION));
A = (INTERNAL_PRECISION) n;
z0 = M_PI / A;
A = n % 2 == 0 ? A : ONE / A;
M = d -> epsilon * A;
epssq = d -> epsilon * d -> epsilon;
for (m = 0; m < d -> dd; m++) {
czero = 2 * (m + 1) == n; /* n even and m = dd - 1*/
if (!czero) {
z = z0 * ((INTERNAL_PRECISION) m + ONE);
t = tan(z);
c = - t*t;
(d -> a)[d -> dn - 1 - m] = c;
M *= epssq - c;
}
z = z0 * ((INTERNAL_PRECISION) m + HALF);
t = tan(z);
cp = - t*t;
(d -> ap)[d -> dd - 1 - m] = cp;
M /= epssq - cp;
}
d -> gamma = (INTERNAL_PRECISION*) malloc((1 + d -> n) *
sizeof(INTERNAL_PRECISION));
for (m = 0; m < d -> n; m++) d -> gamma[m] = ONE;
d -> A = A;
d -> Delta = ONE - M;
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
zd -> dd = d -> dd;
zd -> da = d -> da;
zd -> db = d -> db;
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
return zd;
}
}}
#ifdef TEST
#undef ZERO
#define ZERO ((PRECISION) 0)
#undef ONE
#define ONE ((PRECISION) 1)
#undef TWO
#define TWO ((PRECISION) 2)
/* Evaluate the rational approximation R(x) using the factored form */
static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R;
if (rdata -> type == 0) {
R = rdata -> A * x;
for (m = 0; m < rdata -> deg_denom/2; m++)
R *= (2*(m+1) > rdata -> deg_num ? ONE : x*x - rdata -> a[m]) /
(x*x - rdata -> ap[m]);
} else {
R = rdata -> A / x;
for (m = 0; m < rdata -> deg_num/2; m++)
R *= (x*x - rdata -> a[m]) /
(2*(m+1) > rdata -> deg_denom ? ONE : x*x - rdata -> ap[m]);
}
return R;
}
/* Evaluate the rational approximation R(x) using the partial fraction form */
static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> alpha[rdata -> da - 1];
for (m = 0; m < rdata -> dd; m++)
R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
return R * x;
}
/* Evaluate the rational approximation R(x) using continued fraction form.
*
* If x = 0 and type = 1 then the result should be INF, whereas if x = 0 and
* type = 0 then the result should be 0, but division by zero will occur at
* intermediate stages of the evaluation. For IEEE implementations with
* non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
* but with signalling overflow you will get an error message. */
static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION R = rdata -> beta[0] * x;
for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
return R;
}
/* Evaluate the rational approximation R(x) using Cayley form */
static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
int m;
PRECISION T;
T = rdata -> type == 0 ? ONE : -ONE;
for (m = 0; m < rdata -> n; m++)
T *= (rdata -> gamma[m] - x) / (rdata -> gamma[m] + x);
return (ONE - T) / (ONE + T);
}
/* Test program. Apart from printing out the parameters for R(x) it produces
* the following data files for plotting (unless NPLOT is defined):
*
* zolotarev-fn is a plot of R(x) for |x| < 1.2. This should look like sgn(x).
*
* zolotarev-err is a plot of the error |R(x) - sgn(x)| scaled by 1/Delta. This
* should oscillate deg_num + deg_denom + 2 times between +1 and -1 over the
* domain epsilon <= |x| <= 1.
*
* If ALLPLOTS is defined then zolotarev-partfrac (zolotarev-contfrac) is a
* plot of the difference between the values of R(x) computed using the
* factored form and the partial fraction (continued fraction) form, scaled by
* 1/Delta. It should be zero everywhere. */
int main(int argc, char** argv) {
int m, n, plotpts = 5000, type = 0;
float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
zolotarev_data *rdata;
PRECISION y;
FILE *plot_function, *plot_error,
*plot_partfrac, *plot_contfrac, *plot_cayley;
if (argc < 3 || argc > 4) {
fprintf(stderr, "Usage: %s epsilon n [type]\n", *argv);
exit(EXIT_FAILURE);
}
sscanf(argv[1], "%g", &eps); /* First argument is epsilon */
sscanf(argv[2], "%d", &n); /* Second argument is n */
if (argc == 4) sscanf(argv[3], "%d", &type); /* Third argument is type */
if (type < 0 || type > 2) {
fprintf(stderr, "%s: type must be 0 (Zolotarev R(0) = 0),\n"
"\t\t1 (Zolotarev R(0) = Inf, or 2 (Higham)\n", *argv);
exit(EXIT_FAILURE);
}
rdata = type == 2
? higham((PRECISION) eps, n)
: zolotarev((PRECISION) eps, n, type);
printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
"\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
"\tPRECISION = " STRINGIFY(PRECISION)
"\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
"\tDelta = %g (maximum error)\n\n"
"\tA = %g (overall factor)\n",
(float) rdata -> epsilon, rdata -> n, type,
rdata -> deg_num, rdata -> deg_denom,
rdata -> type == 1 ? "infinite" : "zero",
(float) rdata -> Delta, (float) rdata -> A);
for (m = 0; m < MIN(rdata -> dd, rdata -> dn); m++)
printf("\ta[%2d] = %14.8g\t\ta'[%2d] = %14.8g\n",
m + 1, (float) rdata -> a[m], m + 1, (float) rdata -> ap[m]);
if (rdata -> dd > rdata -> dn)
printf("\t\t\t\t\ta'[%2d] = %14.8g\n",
rdata -> dn + 1, (float) rdata -> ap[rdata -> dn]);
if (rdata -> dd < rdata -> dn)
printf("\ta[%2d] = %14.8g\n",
rdata -> dd + 1, (float) rdata -> a[rdata -> dd]);
printf("\n\tPartial fraction coefficients\n");
printf("\talpha[ 0] = %14.8g\n",
(float) rdata -> alpha[rdata -> da - 1]);
for (m = 0; m < rdata -> dd; m++)
printf("\talpha[%2d] = %14.8g\ta'[%2d] = %14.8g\n",
m + 1, (float) rdata -> alpha[m], m + 1, (float) rdata -> ap[m]);
if (rdata -> type == 1)
printf("\talpha[%2d] = %14.8g\ta'[%2d] = %14.8g\n",
rdata -> dd + 1, (float) rdata -> alpha[rdata -> dd],
rdata -> dd + 1, (float) ZERO);
printf("\n\tContinued fraction coefficients\n");
for (m = 0; m < rdata -> db; m++)
printf("\tbeta[%2d] = %14.8g\n", m, (float) rdata -> beta[m]);
printf("\n\tCayley transform coefficients\n");
for (m = 0; m < rdata -> n; m++)
printf("\tgamma[%2d] = %14.8g\n", m, (float) rdata -> gamma[m]);
#ifndef NPLOT
plot_function = fopen("zolotarev-fn.dat", "w");
plot_error = fopen("zolotarev-err.dat", "w");
#ifdef ALLPLOTS
plot_partfrac = fopen("zolotarev-partfrac.dat", "w");
plot_contfrac = fopen("zolotarev-contfrac.dat", "w");
plot_cayley = fopen("zolotarev-cayley.dat", "w");
#endif /* ALLPLOTS */
for (m = 0, maxypferr = maxycferr = maxycaylerr = 0.0; m <= plotpts; m++) {
x = 2.4 * (float) m / plotpts - 1.2;
if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
/* skip x = 0 for type 1, as R(0) is singular */
y = zolotarev_eval((PRECISION) x, rdata);
fprintf(plot_function, "%g %g\n", x, (float) y);
fprintf(plot_error, "%g %g\n",
x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
maxypferr = MAX(maxypferr, fabs(ypferr));
maxycferr = MAX(maxycferr, fabs(ycferr));
maxycaylerr = MAX(maxycaylerr, fabs(ycaylerr));
}
#ifdef ALLPLOTS
fprintf(plot_partfrac, "%g %g\n", x, ypferr);
fprintf(plot_contfrac, "%g %g\n", x, ycferr);
fprintf(plot_cayley, "%g %g\n", x, ycaylerr);
#endif /* ALLPLOTS */
}
}
#ifdef ALLPLOTS
fclose(plot_cayley);
fclose(plot_contfrac);
fclose(plot_partfrac);
#endif /* ALLPLOTS */
fclose(plot_error);
fclose(plot_function);
printf("\n\tMaximum PF error = %g (relative to Delta)\n", maxypferr);
printf("\tMaximum CF error = %g (relative to Delta)\n", maxycferr);
printf("\tMaximum Cayley error = %g (relative to Delta)\n", maxycaylerr);
#endif /* NPLOT */
free(rdata -> a);
free(rdata -> ap);
free(rdata -> alpha);
free(rdata -> beta);
free(rdata -> gamma);
free(rdata);
return EXIT_SUCCESS;
}
#endif /* TEST */

View File

@ -0,0 +1,87 @@
/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
#ifdef __cplusplus
namespace Grid {
namespace Approx {
#endif
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION
#define PRECISION double
#endif
#define ZPRECISION PRECISION
#define ZOLOTAREV_DATA zolotarev_data
#endif
/* This struct contains the coefficients which parameterise an optimal rational
* approximation to the signum function.
*
* The parameterisations used are:
*
* Factored form for type 0 (R0(0) = 0)
*
* R0(x) = A * x * prod(x^2 - a[j], j = 0 .. dn-1) / prod(x^2 - ap[j], j = 0
* .. dd-1),
*
* where deg_num = 2*dn + 1 and deg_denom = 2*dd.
*
* Factored form for type 1 (R1(0) = infinity)
*
* R1(x) = (A / x) * prod(x^2 - a[j], j = 0 .. dn-1) / prod(x^2 - ap[j], j = 0
* .. dd-1),
*
* where deg_num = 2*dn and deg_denom = 2*dd + 1.
*
* Partial fraction form
*
* R(x) = alpha[da] * x + sum(alpha[j] * x / (x^2 - ap[j]), j = 0 .. da-1)
*
* where da = dd for type 0 and da = dd + 1 with ap[dd] = 0 for type 1.
*
* Continued fraction form
*
* R(x) = beta[db-1] * x + 1 / (beta[db-2] * x + 1 / (beta[db-3] * x + ...))
*
* with the final coefficient being beta[0], with d' = 2 * dd + 1 for type 0
* and db = 2 * dd + 2 for type 1.
*
* Cayley form (Chiu's domain wall formulation)
*
* R(x) = (1 - T(x)) / (1 + T(x))
*
* where T(x) = prod((x - gamma[j]) / (x + gamma[j]), j = 0 .. n-1)
*/
typedef struct {
ZPRECISION *a, /* zeros of numerator, a[0 .. dn-1] */
*ap, /* poles (zeros of denominator), ap[0 .. dd-1] */
A, /* overall factor */
*alpha, /* coefficients of partial fraction, alpha[0 .. da-1] */
*beta, /* coefficients of continued fraction, beta[0 .. db-1] */
*gamma, /* zeros of numerator of T in Cayley form */
Delta, /* maximum error, |R(x) - sgn(x)| <= Delta */
epsilon; /* minimum x value, epsilon < |x| < 1 */
int n, /* approximation degree */
type, /* 0: R(0) = 0, 1: R(0) = infinity */
dn, dd, da, db, /* number of elements of a, ap, alpha, and beta */
deg_num, /* degree of numerator = deg_denom +/- 1 */
deg_denom; /* degree of denominator */
} ZOLOTAREV_DATA;
#ifndef ZOLOTAREV_INTERNAL
/* zolotarev(epsilon, n, type) returns a pointer to an initialised
* zolotarev_data structure. The arguments must satisfy the constraints that
* epsilon > 0, n > 0, and type = 0 or 1. */
ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
void zolotarev_free(zolotarev_data *zdata);
#endif
#ifdef __cplusplus
}}
#endif

View File

@ -0,0 +1,206 @@
/*
Mike Clark - 25th May 2005
bigfloat.h
Simple C++ wrapper for multiprecision datatype used by AlgRemez
algorithm
*/
#ifndef INCLUDED_BIGFLOAT_H
#define INCLUDED_BIGFLOAT_H
#include <gmp.h>
#include <mpf2mpfr.h>
#include <mpfr.h>
class bigfloat {
private:
mpf_t x;
public:
bigfloat() { mpf_init(x); }
bigfloat(const bigfloat& y) { mpf_init_set(x, y.x); }
bigfloat(const unsigned long u) { mpf_init_set_ui(x, u); }
bigfloat(const long i) { mpf_init_set_si(x, i); }
bigfloat(const int i) {mpf_init_set_si(x,(long)i);}
bigfloat(const float d) { mpf_init_set_d(x, (double)d); }
bigfloat(const double d) { mpf_init_set_d(x, d); }
bigfloat(const char *str) { mpf_init_set_str(x, (char*)str, 10); }
~bigfloat(void) { mpf_clear(x); }
operator double (void) const { return (double)mpf_get_d(x); }
static void setDefaultPrecision(unsigned long dprec) {
unsigned long bprec = (unsigned long)(3.321928094 * (double)dprec);
mpf_set_default_prec(bprec);
}
void setPrecision(unsigned long dprec) {
unsigned long bprec = (unsigned long)(3.321928094 * (double)dprec);
mpf_set_prec(x,bprec);
}
unsigned long getPrecision(void) const { return mpf_get_prec(x); }
unsigned long getDefaultPrecision(void) const { return mpf_get_default_prec(); }
bigfloat& operator=(const bigfloat& y) {
mpf_set(x, y.x);
return *this;
}
bigfloat& operator=(const unsigned long y) {
mpf_set_ui(x, y);
return *this;
}
bigfloat& operator=(const signed long y) {
mpf_set_si(x, y);
return *this;
}
bigfloat& operator=(const float y) {
mpf_set_d(x, (double)y);
return *this;
}
bigfloat& operator=(const double y) {
mpf_set_d(x, y);
return *this;
}
size_t write(void);
size_t read(void);
/* Arithmetic Functions */
bigfloat& operator+=(const bigfloat& y) { return *this = *this + y; }
bigfloat& operator-=(const bigfloat& y) { return *this = *this - y; }
bigfloat& operator*=(const bigfloat& y) { return *this = *this * y; }
bigfloat& operator/=(const bigfloat& y) { return *this = *this / y; }
friend bigfloat operator+(const bigfloat& x, const bigfloat& y) {
bigfloat a;
mpf_add(a.x,x.x,y.x);
return a;
}
friend bigfloat operator+(const bigfloat& x, const unsigned long y) {
bigfloat a;
mpf_add_ui(a.x,x.x,y);
return a;
}
friend bigfloat operator-(const bigfloat& x, const bigfloat& y) {
bigfloat a;
mpf_sub(a.x,x.x,y.x);
return a;
}
friend bigfloat operator-(const unsigned long x, const bigfloat& y) {
bigfloat a;
mpf_ui_sub(a.x,x,y.x);
return a;
}
friend bigfloat operator-(const bigfloat& x, const unsigned long y) {
bigfloat a;
mpf_sub_ui(a.x,x.x,y);
return a;
}
friend bigfloat operator-(const bigfloat& x) {
bigfloat a;
mpf_neg(a.x,x.x);
return a;
}
friend bigfloat operator*(const bigfloat& x, const bigfloat& y) {
bigfloat a;
mpf_mul(a.x,x.x,y.x);
return a;
}
friend bigfloat operator*(const bigfloat& x, const unsigned long y) {
bigfloat a;
mpf_mul_ui(a.x,x.x,y);
return a;
}
friend bigfloat operator/(const bigfloat& x, const bigfloat& y){
bigfloat a;
mpf_div(a.x,x.x,y.x);
return a;
}
friend bigfloat operator/(const unsigned long x, const bigfloat& y){
bigfloat a;
mpf_ui_div(a.x,x,y.x);
return a;
}
friend bigfloat operator/(const bigfloat& x, const unsigned long y){
bigfloat a;
mpf_div_ui(a.x,x.x,y);
return a;
}
friend bigfloat sqrt_bf(const bigfloat& x){
bigfloat a;
mpf_sqrt(a.x,x.x);
return a;
}
friend bigfloat sqrt_bf(const unsigned long x){
bigfloat a;
mpf_sqrt_ui(a.x,x);
return a;
}
friend bigfloat abs_bf(const bigfloat& x){
bigfloat a;
mpf_abs(a.x,x.x);
return a;
}
friend bigfloat pow_bf(const bigfloat& a, long power) {
bigfloat b;
mpf_pow_ui(b.x,a.x,power);
return b;
}
friend bigfloat pow_bf(const bigfloat& a, bigfloat &power) {
bigfloat b;
mpfr_pow(b.x,a.x,power.x,GMP_RNDN);
return b;
}
friend bigfloat exp_bf(const bigfloat& a) {
bigfloat b;
mpfr_exp(b.x,a.x,GMP_RNDN);
return b;
}
/* Comparison Functions */
friend int operator>(const bigfloat& x, const bigfloat& y) {
int test;
test = mpf_cmp(x.x,y.x);
if (test > 0) return 1;
else return 0;
}
friend int operator<(const bigfloat& x, const bigfloat& y) {
int test;
test = mpf_cmp(x.x,y.x);
if (test < 0) return 1;
else return 0;
}
friend int sgn(const bigfloat&);
};
#endif

View File

@ -0,0 +1,189 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/bigfloat_double.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <math.h>
typedef double mfloat;
class bigfloat {
private:
mfloat x;
public:
bigfloat() { }
bigfloat(const bigfloat& y) { x=y.x; }
bigfloat(const unsigned long u) { x=u; }
bigfloat(const long i) { x=i; }
bigfloat(const int i) { x=i;}
bigfloat(const float d) { x=d;}
bigfloat(const double d) { x=d;}
bigfloat(const char *str) { x=std::stod(std::string(str));}
~bigfloat(void) { }
operator double (void) const { return (double)x; }
static void setDefaultPrecision(unsigned long dprec) {
}
void setPrecision(unsigned long dprec) {
}
unsigned long getPrecision(void) const { return 64; }
unsigned long getDefaultPrecision(void) const { return 64; }
bigfloat& operator=(const bigfloat& y) { x=y.x; return *this; }
bigfloat& operator=(const unsigned long y) { x=y; return *this; }
bigfloat& operator=(const signed long y) { x=y; return *this; }
bigfloat& operator=(const float y) { x=y; return *this; }
bigfloat& operator=(const double y) { x=y; return *this; }
size_t write(void);
size_t read(void);
/* Arithmetic Functions */
bigfloat& operator+=(const bigfloat& y) { return *this = *this + y; }
bigfloat& operator-=(const bigfloat& y) { return *this = *this - y; }
bigfloat& operator*=(const bigfloat& y) { return *this = *this * y; }
bigfloat& operator/=(const bigfloat& y) { return *this = *this / y; }
friend bigfloat operator+(const bigfloat& x, const bigfloat& y) {
bigfloat a;
a.x=x.x+y.x;
return a;
}
friend bigfloat operator+(const bigfloat& x, const unsigned long y) {
bigfloat a;
a.x=x.x+y;
return a;
}
friend bigfloat operator-(const bigfloat& x, const bigfloat& y) {
bigfloat a;
a.x=x.x-y.x;
return a;
}
friend bigfloat operator-(const unsigned long x, const bigfloat& y) {
bigfloat bx(x);
return bx-y;
}
friend bigfloat operator-(const bigfloat& x, const unsigned long y) {
bigfloat by(y);
return x-by;
}
friend bigfloat operator-(const bigfloat& x) {
bigfloat a;
a.x=-x.x;
return a;
}
friend bigfloat operator*(const bigfloat& x, const bigfloat& y) {
bigfloat a;
a.x=x.x*y.x;
return a;
}
friend bigfloat operator*(const bigfloat& x, const unsigned long y) {
bigfloat a;
a.x=x.x*y;
return a;
}
friend bigfloat operator/(const bigfloat& x, const bigfloat& y){
bigfloat a;
a.x=x.x/y.x;
return a;
}
friend bigfloat operator/(const unsigned long x, const bigfloat& y){
bigfloat bx(x);
return bx/y;
}
friend bigfloat operator/(const bigfloat& x, const unsigned long y){
bigfloat by(y);
return x/by;
}
friend bigfloat sqrt_bf(const bigfloat& x){
bigfloat a;
a.x= sqrt(x.x);
return a;
}
friend bigfloat sqrt_bf(const unsigned long x){
bigfloat a(x);
return sqrt_bf(a);
}
friend bigfloat abs_bf(const bigfloat& x){
bigfloat a;
a.x=fabs(x.x);
return a;
}
friend bigfloat pow_bf(const bigfloat& a, long power) {
bigfloat b;
b.x=pow(a.x,power);
return b;
}
friend bigfloat pow_bf(const bigfloat& a, bigfloat &power) {
bigfloat b;
b.x=pow(a.x,power.x);
return b;
}
friend bigfloat exp_bf(const bigfloat& a) {
bigfloat b;
b.x=exp(a.x);
return b;
}
/* Comparison Functions */
friend int operator>(const bigfloat& x, const bigfloat& y) {
return x.x>y.x;
}
friend int operator<(const bigfloat& x, const bigfloat& y) {
return x.x<y.x;
}
friend int sgn(const bigfloat& x) {
if ( x.x>=0 ) return 1;
else return 0;
}
/* Miscellaneous Functions */
// friend bigfloat& random(void);
};

View File

@ -0,0 +1,397 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/AdefGeneric.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
#define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
/*
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Deflation methods considered
* -- Solve P A x = P b [ like Luscher ]
* DEF-1 M P A x = M P b [i.e. left precon]
* DEF-2 P^T M A x = P^T M b
* ADEF-1 Preconditioner = M P + Q [ Q + M + M A Q]
* ADEF-2 Preconditioner = P^T M + Q
* BNN Preconditioner = P^T M P + Q
* BNN2 Preconditioner = M P + P^TM +Q - M P A M
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
* Vout = x
*/
// abstract base
template<class Field, class CoarseField>
class TwoLevelFlexiblePcg : public LinearFunction<Field>
{
public:
int verbose;
RealD Tolerance;
Integer MaxIterations;
const int mmax = 5;
GridBase *grid;
GridBase *coarsegrid;
LinearOperatorBase<Field> *_Linop
OperatorFunction<Field> *_Smoother,
LinearFunction<CoarseField> *_CoarseSolver;
// Need somthing that knows how to get from Coarse to fine and back again
// more most opertor functions
TwoLevelFlexiblePcg(RealD tol,
Integer maxit,
LinearOperatorBase<Field> *Linop,
LinearOperatorBase<Field> *SmootherLinop,
OperatorFunction<Field> *Smoother,
OperatorFunction<CoarseField> CoarseLinop
) :
Tolerance(tol),
MaxIterations(maxit),
_Linop(Linop),
_PreconditionerLinop(PrecLinop),
_Preconditioner(Preconditioner)
{
verbose=0;
};
// The Pcg routine is common to all, but the various matrices differ from derived
// implementation to derived implmentation
void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.checkerboard = src.checkerboard;
grid = src._grid;
RealD f;
RealD rtzp,rtz,a,d,b;
RealD rptzp;
RealD tn;
RealD guess = norm2(psi);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
/////////////////////////////
// Set up history vectors
/////////////////////////////
std::vector<Field> p (mmax,grid);
std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax);
Field x (grid); x = psi;
Field z (grid);
Field tmp(grid);
Field r (grid);
Field mu (grid);
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
x=src;
Vstart(x,src);
// r0 = b -A x0
HermOp(x,mmp); // Shouldn't this be something else?
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
M1(r,z,tmp,mp,SmootherMirs);
rtzp =real(innerProduct(r,z));
///////////////////////////////////////
// Solve for Mss mu = P A z and set p = z-mu
// Def2: p = 1 - Q Az = Pright z
// Other algos M2 is trivial
///////////////////////////////////////
M2(z,p[0]);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
rtz=rtzp;
d= M3(p[peri_k],mp,mmp[peri_k],tmp);
a = rtz/d;
// Memorise this
pAp[peri_k] = d;
axpy(x,a,p[peri_k],x);
RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
// Compute z = M x
M1(r,z,tmp,mp);
rtzp =real(innerProduct(r,z));
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
p[peri_kp]=p[peri_k];
// Standard search direction p -> z + b p ; b =
b = (rtzp)/rtz;
int northog;
// northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
RealD beta = -pbApk/pAp[peri_back];
axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
}
RealD rrn=sqrt(rn/ssq);
std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
// Stopping condition
if ( rn <= rsq ) {
HermOp(x,mmp); // Shouldn't this be something else?
axpy(tmp,-1.0,src,mmp[0]);
RealD psinorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
return k;
}
}
// Non-convergence
assert(0);
}
public:
virtual void M(Field & in,Field & out,Field & tmp) {
}
virtual void M1(Field & in, Field & out) {// the smoother
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
PcgM(in,Min); // Smoother call
HermOp(Min,out);
axpy(tmp,-1.0,out,in); // tmp = in - A Min
ProjectToSubspace(tmp,PleftProj);
ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
axpy(out,1.0,Min,tmp); // Min+tmp
}
virtual void M2(const Field & in, Field & out) {
out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
}
virtual RealD M3(const Field & p, Field & mmp){
double d,dd;
HermOpAndNorm(p,mmp,d,dd);
return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
}
virtual void VstartDef2(Field & xconst Field & src){
//case PcgDef2:
//case PcgAdef2:
//case PcgAdef2f:
//case PcgV11f:
///////////////////////////////////
// Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
// = [1 - Ass_inv A] Guess + Assinv src
// = P^T guess + Assinv src
// = Vstart [Tang notation]
// This gives:
// W^T (src - A x_0) = src_s - A guess_s - r_s
// = src_s - (A guess)_s - src_s + (A guess)_s
// = 0
///////////////////////////////////
Field r(grid);
Field mmp(grid);
HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
ProjectToSubspace(r,PleftProj);
ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp;
}
virtual void Vstart(Field & x,const Field & src){
return;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out){
// P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ]
Field in_sbar(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
}
virtual void Pleft (Field & in,Field & out){
// P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ]
Field in_sbar(grid);
Field tmp2(grid);
Field Mtmp(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out);
HermOp(out,Mtmp);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
}
}
template<class Field>
class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp){
}
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
}
virtual void M2(Field & in, Field & out){
}
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
}
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif

View File

@ -0,0 +1,606 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
Copyright (C) 2017
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> {
public:
typedef typename Field::scalar_type scomplex;
int blockDim ;
int Nblock;
BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
{};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related
m_rr = 0.5*(m_rr+m_rr.adjoint());
#if 0
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
auto D_ldlt = m_rr.ldlt().vectorD();
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
#endif
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
// Q = R C^{-1}
//
// Q_j = R_i Cinv(i,j)
//
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceMulMatrix(Q,Cinv,R,Orthog);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Call one of several implementations
////////////////////////////////////////////////////////////////////////////////////////////////////
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
if ( CGtype == BlockCGrQ ) {
BlockCGrQsolve(Linop,Src,Psi);
} else if (CGtype == BlockCG ) {
BlockCGsolve(Linop,Src,Psi);
} else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi);
} else {
assert(0);
}
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQ implementation:
//--------------------------
// X is guess/Solution
// B is RHS
// Solve A X_i = B_i ; i refers to Nblock index
////////////////////////////////////////////////////////////////////////////
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard;
conformable(X, B);
Field tmp(B);
Field Q(B);
Field D(B);
Field Z(B);
Field AD(B);
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
************************************************************************
* Dimensions:
*
* X,B==(Nferm x Nblock)
* A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
*
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
* for k:
* Z = AD
* M = [D^dag Z]^{-1}
* X = X + D MC
* QS = Q - ZM
* D = Q + D S^dag
* C = S C
*/
///////////////////////////////////////
// Initial block: initial search dir is guess
///////////////////////////////////////
std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
Linop.HermOp(X, AD);
tmp = B - AD;
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
//3. Z = AD
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();
sliceMaddMatrix(X,m_tmp, D,X,Orthog);
sliceMaddTimer.Stop();
//6. QS = Q - ZM
sliceMaddTimer.Start();
sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
sliceMaddTimer.Stop();
QRTimer.Start();
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
QRTimer.Stop();
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
sliceMaddTimer.Stop();
//8. C = S C
m_C = m_S*m_C;
/*********************
* convergence monitor
*********************
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
for(int b=0;b<Nblock;b++) {
rrsum+=real(m_rr(b,b));
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(X, AD);
AD = AD-B;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_alpha = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_beta = Eigen::MatrixXcd::Zero(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
/************************************************************************
* Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
************************************************************************
* O'Leary : R = B - A X
* O'Leary : P = M R ; preconditioner M = 1
* O'Leary : alpha = PAP^{-1} RMR
* O'Leary : beta = RMR^{-1}_old RMR_new
* O'Leary : X=X+Palpha
* O'Leary : R_new=R_old-AP alpha
* O'Leary : P=MR_new+P beta
*/
R = Src - AP;
P = R;
sliceInnerProductMatrix(m_rr,R,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
m_pAp_inv = m_pAp.inverse();
m_alpha = m_pAp_inv * m_rr ;
// Psi, R update
sliceMaddTimer.Start();
sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddMatrix(R ,m_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
m_rr_inv = m_rr.inverse();
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_rr,R,R,Orthog);
sliceInnerTimer.Stop();
m_beta = m_rr_inv *m_rr;
// Search update
sliceMaddTimer.Start();
sliceMaddMatrix(AP,m_beta,P,R,Orthog);
sliceMaddTimer.Stop();
P= AP;
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
RealD rr;
for(int b=0;b<Nblock;b++){
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// multiRHS conjugate gradient. Dimension zero should be the block direction
// Use this for spread out across nodes
//////////////////////////////////////////////////////////////////////////
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
std::vector<ComplexD> v_pAp(Nblock);
std::vector<RealD> v_rr (Nblock);
std::vector<RealD> v_rr_inv(Nblock);
std::vector<RealD> v_alpha(Nblock);
std::vector<RealD> v_beta(Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
R = Src - AP;
P = R;
sliceNorm(v_rr,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch sliceNormTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductVector(v_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
for(int b=0;b<Nblock;b++){
v_alpha[b] = v_rr[b]/real(v_pAp[b]);
}
// Psi, R update
sliceMaddTimer.Start();
sliceMaddVector(Psi,v_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddVector(R ,v_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
for(int b=0;b<Nblock;b++){
v_rr_inv[b] = 1.0/v_rr[b];
}
sliceNormTimer.Start();
sliceNorm(v_rr,R,Orthog);
sliceNormTimer.Stop();
for(int b=0;b<Nblock;b++){
v_beta[b] = v_rr_inv[b] *v_rr[b];
}
// Search update
sliceMaddTimer.Start();
sliceMaddVector(P,v_beta,P,R,Orthog);
sliceMaddTimer.Stop();
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
for(int b=0;b<Nblock;b++){
RealD rr = v_rr[b]/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << sliceNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}
#endif

View File

@ -0,0 +1,177 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradient.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard;
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
Field p(src);
Field mmp(src);
Field r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
return;
}
std::cout << GridLogIterative << std::setprecision(8)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
GridStopWatch LinearCombTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations*1000; k++) {
c = cp;
MatrixTimer.Start();
Linop.HermOp(p, mmp);
MatrixTimer.Stop();
LinalgTimer.Start();
InnerTimer.Start();
ComplexD dc = innerProduct(p,mmp);
InnerTimer.Stop();
d = dc.real();
a = c / d;
AxpyNormTimer.Start();
cp = axpy_norm(r, -a, mmp, r);
AxpyNormTimer.Stop();
b = cp / c;
LinearCombTimer.Start();
parallel_for(int ss=0;ss<src._grid->oSites();ss++){
vstream(psi[ss], a * p[ss] + psi[ss]);
vstream(p [ss], b * p[ss] + r[ss]);
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}
#endif

View File

@ -0,0 +1,154 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
namespace Grid {
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL){ };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.checkerboard;
sol_d.checkerboard = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid;
FieldD tmp_d(DoublePrecGrid);
tmp_d.checkerboard = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.checkerboard = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.checkerboard = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
zeroit(sol_f);
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
}
#endif

View File

@ -0,0 +1,322 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
public OperatorFunction<Field>
{
public:
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
int verbose;
MultiShiftFunction shifts;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
verbose=1;
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
GridBase *grid = src._grid;
int nshift = shifts.order;
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
GridBase *grid = src._grid;
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions
assert(psi.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
Field r(grid);
Field p(grid);
Field tmp(grid);
Field mmp(grid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src);
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl;
ps[s] = src;
}
// r and p for primary
r=src;
p=src;
//MdagM+m[0]
Linop.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
// have verified that inner product of
// p and mmp is equal to d after this since
// the d computation is tricky
// qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r,b,mmp,r);
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer;
GridStopWatch ShiftTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p,a,p,r);
AXPYTimer.Stop();
// Note to self - direction ps is iterated seperately
// for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case.
//
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps[s],a,ps[s],r);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
MatrixTimer.Start();
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
// The below is faster on KNL
Linop.HermOp(p,mmp);
d=real(innerProduct(p,mmp));
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp,mass[0],p,mmp);
AXPYTimer.Stop();
RealD rn = norm2(p);
d += rn*mass[0];
bp=b;
b=-cp/d;
AXPYTimer.Start();
c=axpy_norm(r,b,mmp,r);
AXPYTimer.Stop();
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
for(int s=0;s<nshift;s++){
int ss = s;
// Scope for optimisation here in case of "single".
// Could load psi[0] and pull all ps[s] in.
// if ( single ) ss=primary;
// Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain:
//
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
// If can predict the coefficient bs then we can fuse these and avoid write reread cyce
// on ps[s].
// Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
if( (!converged[s]) ) {
axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
}
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop.HermOpAndNorm(psi[s],mmp,d,qq);
axpy(tmp,mass[s],psi[s],mmp);
axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
};
}
#endif

View File

@ -0,0 +1,256 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
namespace Grid {
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
}
void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
psi.checkerboard = src.checkerboard;
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
return;
}
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.checkerboard = r.checkerboard;
precisionChange(r_f, r);
FieldF psi_f(r_f);
psi_f = zero;
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
psi_f = zero;
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
};
};
#endif

View File

@ -0,0 +1,111 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_RESIDUAL_H
#define GRID_CONJUGATE_RESIDUAL_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {
verbose=0;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b, c, d;
RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
GridBase *grid = src._grid;
psi=zero;
Field r(grid), p(grid), Ap(grid), Ar(grid);
r=src;
p=src;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=1;k<MaxIterations;k++){
a = rAr/pAAp;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,Ap,r);
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
}
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
}
#endif

View File

@ -0,0 +1,112 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DEFLATION_H
#define GRID_DEFLATION_H
namespace Grid {
template<class Field>
class Guesser {
public:
Guesser(void) = default;
virtual ~Guesser(void) = default;
virtual void operator()(const Field &src, Field &guess) = 0;
};
template<class Field>
class ZeroGuesser: public Guesser<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = zero; };
};
template<class Field>
class SourceGuesser: public Guesser<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = src; };
};
////////////////////////////////
// Fine grid deflation
////////////////////////////////
template<class Field>
class DeflatedGuesser: public Guesser<Field> {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) {
guess = zero;
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
guess.checkerboard = src.checkerboard;
}
};
template<class FineField, class CoarseField>
class LocalCoherenceDeflatedGuesser: public Guesser<FineField> {
private:
const std::vector<FineField> &subspace;
const std::vector<CoarseField> &evec_coarse;
const std::vector<RealD> &eval_coarse;
public:
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
: subspace(_subspace),
evec_coarse(_evec_coarse),
eval_coarse(_eval_coarse)
{
}
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
guess.checkerboard = src.checkerboard;
};
};
}
#endif

View File

@ -0,0 +1,842 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BIRL_H
#define GRID_BIRL_H
#include <string.h> //memset
//#include <zlib.h>
#include <sys/stat.h>
namespace Grid {
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
parallel_region
{
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis[k]._odata[ss];
}
}
for(int j=j0; j<j1; ++j){
basis[j]._odata[ss] = B[j];
}
}
}
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
result.checkerboard = basis[0].checkerboard;
parallel_for(int ss=0;ss < grid->oSites();ss++){
vobj B = zero;
for(int k=k0; k<k1; ++k){
B +=Qt(j,k) * basis[k]._odata[ss];
}
result._odata[ss] = B;
}
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template<class Field> class ImplicitlyRestartedLanczosTester
{
public:
virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
};
enum IRLdiagonalisation {
IRLdiagonaliseWithDSTEGR,
IRLdiagonaliseWithQR,
IRLdiagonaliseWithEigen
};
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{
public:
LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { };
int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
{
return TestConvergence(j,resid,B,eval,evalMaxApprox);
}
int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox)
{
Field v(B);
RealD eval_poly = eval;
// Apply operator
_HermOp(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval = vnum/vden;
v -= eval*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
};
template<class Field>
class ImplicitlyRestartedLanczos {
private:
const RealD small = 1.0e-8;
int MaxIter;
int MinRestart; // Minimum number of restarts; only check for convergence after
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
// int Np; // Np -- Number of spare vecs in krylov space // == Nm - Nk
int Nm; // Nm -- total number of vectors
IRLdiagonalisation diagonalisation;
int orth_period;
RealD OrthoTime;
RealD eresid, betastp;
////////////////////////////////
// Embedded objects
////////////////////////////////
LinearFunction<Field> &_PolyOp;
LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosTester<Field> &_Tester;
// Default tester provided (we need a ref to something in default case)
ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
/////////////////////////
// Constructor
/////////////////////////
public:
//////////////////////////////////////////////////////////////////
// PAB:
//////////////////////////////////////////////////////////////////
// Too many options & knobs.
// Eliminate:
// orth_period
// betastp
// MinRestart
//
// Do we really need orth_period
// What is the theoretical basis & guarantees of betastp ?
// Nstop=Nk viable?
// MinRestart avoidable with new convergence test?
// Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
// HermOp could be eliminated if we dropped the Power method for max eval.
// -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
//////////////////////////////////////////////////////////////////
ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
LinearFunction<Field> & HermOp,
ImplicitlyRestartedLanczosTester<Field> & Tester,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
LinearFunction<Field> & HermOp,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
////////////////////////////////
// Helpers
////////////////////////////////
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
void orthogonalize(Field& w, std::vector<Field>& evec,int k)
{
OrthoTime-=usecond()/1e6;
basisOrthogonalize(evec,w,k);
normalise(w);
OrthoTime+=usecond()/1e6;
}
/* Rudy Arthur's thesis pp.137
------------------------
Require: M > K P = M K †
Compute the factorization AVM = VM HM + fM eM
repeat
Q=I
for i = 1,...,P do
QiRi =HM θiI Q = QQi
H M = Q †i H M Q i
end for
βK =HM(K+1,K) σK =Q(M,K)
r=vK+1βK +rσK
VK =VM(1:M)Q(1:M,1:K)
HK =HM(1:K,1:K)
→AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM
until convergence
*/
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{
GridBase *grid = src._grid;
assert(grid == evec[0]._grid);
GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" -- seek Nk = " << Nk <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- total Nm = " << Nm <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
} else if ( diagonalisation == IRLdiagonaliseWithQR ) {
std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
}
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
assert(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
{
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
normalise(src_n);
_HermOp(src_n,tmp);
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp;
}
}
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
std::vector<RealD> eval2(Nm);
std::vector<RealD> eval2_copy(Nm);
Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
Field f(grid);
Field v(grid);
int k1 = 1;
int k2 = Nk;
RealD beta_k;
Nconv = 0;
// Set initial vector
evec[0] = src;
normalise(evec[0]);
// Initial Nk steps
OrthoTime=0.;
for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
//////////////////////////////////
// Restarting loop begins
//////////////////////////////////
int iter;
for(iter = 0; iter<MaxIter; ++iter){
OrthoTime=0.;
std::cout<< GridLogMessage <<" **********************"<< std::endl;
std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
std::cout<< GridLogMessage <<" **********************"<< std::endl;
std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
f *= lme[Nm-1];
std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
//////////////////////////////////
// getting eigenvalues
//////////////////////////////////
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k+k1-1];
lme2[k] = lme[k+k1-1];
}
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
//////////////////////////////////
// sorting
//////////////////////////////////
eval2_copy = eval2;
std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
const int chunk=8;
for(int io=0; io<k2;io+=chunk){
std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
for(int ii=0;ii<chunk;ii++){
if ( (io+ii)<k2 )
std::cout<< " "<< std::setw(12)<< eval2[io+ii];
}
std::cout << std::endl;
}
//////////////////////////////////
// Implicitly shifted QR transformations
//////////////////////////////////
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
for(int ip=k2; ip<Nm; ++ip){
QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
}
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl;
////////////////////////////////////////////////////
// Compressed vector f and beta(k2)
////////////////////////////////////////////////////
f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2];
beta_k = norm2(f);
beta_k = sqrt(beta_k);
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k;
evec[k2] = betar * f;
lme[k2-1] = beta_k;
////////////////////////////////////////////////////
// Convergence test
////////////////////////////////////////////////////
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k];
lme2[k] = lme[k];
}
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
Nconv = 0;
if (iter >= MinRestart) {
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
Field B(grid); B.checkerboard = evec[0].checkerboard;
// power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1;
for(int jj = 1; jj<=Nstop; jj*=2){
int j = Nstop-jj;
RealD e = eval2_copy[j]; // Discard the evalue
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
allconv=0;
}
}
// Do evec[0] for good measure
{
int j=0;
RealD e = eval2_copy[0];
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
}
if ( allconv ) Nconv = Nstop;
// test if we converged, if so, terminate
std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
// if( Nconv>=Nstop || beta_k < betastp){
if( Nconv>=Nstop){
goto converged;
}
} else {
std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
} // end of iter loop
}
std::cout<<GridLogError<<"\n NOT converged.\n";
abort();
converged:
{
Field B(grid); B.checkerboard = evec[0].checkerboard;
basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0;
//////////////////////////////////////////////////////////////////////
// Full final convergence test; unconditionally applied
//////////////////////////////////////////////////////////////////////
for(int j = 0; j<=Nk; j++){
B=evec[j];
if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
Nconv++;
}
}
if ( Nconv < Nstop )
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
eval=eval2;
//Keep only converged
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
basisSortInPlace(evec,eval,reverse);
}
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL << " -- Iterations = "<< iter << "\n";
std::cout << GridLogIRL << " -- beta(k) = "<< beta_k << "\n";
std::cout << GridLogIRL << " -- Nconv = "<< Nconv << "\n";
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
}
private:
/* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do:
3. wk:=Avkβkv_{k1}
4. αk:=(wk,vk) //
5. wk:=wkαkvk // wk orthog vk
6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
7. vk+1 := wk/βk+1
8. EndDo
*/
void step(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
std::vector<Field>& evec,
Field& w,int Nm,int k)
{
const RealD tiny = 1.0e-20;
assert( k< Nm );
GridStopWatch gsw_op,gsw_o;
Field& evec_k = evec[k];
_PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
RealD alph = real(zalph);
w = w - alph * evec_k;// 5. wk:=wkαkvk
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
lmd[k] = alph;
lme[k] = beta;
if (k>0 && k % orth_period == 0) {
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
}
if(k < Nm-1) evec[k+1] = w;
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
}
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt, // Nm x Nm
GridBase *grid)
{
Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
for(int i=0;i<Nk;i++) TriDiag(i,i) = lmd[i];
for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
for (int i = 0; i < Nk; i++) {
lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
}
for (int i = 0; i < Nk; i++) {
for (int j = 0; j < Nk; j++) {
Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
}
}
}
///////////////////////////////////////////////////////////////////////////
// File could end here if settle on Eigen ??? !!!
///////////////////////////////////////////////////////////////////////////
void QR_decomp(std::vector<RealD>& lmd, // Nm
std::vector<RealD>& lme, // Nm
int Nk, int Nm, // Nk, Nm
Eigen::MatrixXd& Qt, // Nm x Nm matrix
RealD Dsh, int kmin, int kmax)
{
int k = kmin-1;
RealD x;
RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
RealD c = ( lmd[k] -Dsh) *Fden;
RealD s = -lme[k] *Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
x =-s*lme[k+1];
lme[k+1] = c*lme[k+1];
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt(k,i);
RealD Qtmp2 = Qt(k+1,i);
Qt(k,i) = c*Qtmp1 - s*Qtmp2;
Qt(k+1,i)= s*Qtmp1 + c*Qtmp2;
}
// Givens transformations
for(int k = kmin; k < kmax-1; ++k){
RealD Fden = 1.0/hypot(x,lme[k-1]);
RealD c = lme[k-1]*Fden;
RealD s = - x*Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
lme[k-1] = c*lme[k-1] -s*x;
if(k != kmax-2){
x = -s*lme[k+1];
lme[k+1] = c*lme[k+1];
}
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt(k,i);
RealD Qtmp2 = Qt(k+1,i);
Qt(k,i) = c*Qtmp1 -s*Qtmp2;
Qt(k+1,i) = s*Qtmp1 +c*Qtmp2;
}
}
}
void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt,
GridBase *grid)
{
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
} else if ( diagonalisation == IRLdiagonaliseWithQR ) {
diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
} else {
assert(0);
}
}
#ifdef USE_LAPACK
void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
#endif
void diagonalize_lapack(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd& Qt,
GridBase *grid)
{
#ifdef USE_LAPACK
const int size = Nm;
int NN = Nk;
double evals_tmp[NN];
double evec_tmp[NN][NN];
memset(evec_tmp[0],0,sizeof(double)*NN*NN);
double DD[NN];
double EE[NN];
for (int i = 0; i< NN; i++) {
for (int j = i - 1; j <= i + 1; j++) {
if ( j < NN && j >= 0 ) {
if (i==j) DD[i] = lmd[i];
if (i==j) evals_tmp[i] = lmd[i];
if (j==(i-1)) EE[j] = lme[j];
}
}
}
int evals_found;
int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
int liwork = 3+NN*10 ;
int iwork[liwork];
double work[lwork];
int isuppz[2*NN];
char jobz = 'V'; // calculate evals & evecs
char range = 'I'; // calculate all evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
int info;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN/total)+1;
double vl = 0.0, vu = 0.0;
int il = interval*node+1 , iu = interval*(node+1);
if (iu > NN) iu=NN;
double tol = 0.0;
if (1) {
memset(evals_tmp,0,sizeof(double)*NN);
if ( il <= NN){
LAPACK_dstegr(&jobz, &range, &NN,
(double*)DD, (double*)EE,
&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double*)evec_tmp, &NN,
isuppz,
work, &lwork, iwork, &liwork,
&info);
for (int i = iu-1; i>= il-1; i--){
evals_tmp[i] = evals_tmp[i - (il-1)];
if (il>1) evals_tmp[i-(il-1)]=0.;
for (int j = 0; j< NN; j++){
evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
if (il>1) evec_tmp[i-(il-1)][j]=0.;
}
}
}
{
grid->GlobalSumVector(evals_tmp,NN);
grid->GlobalSumVector((double*)evec_tmp,NN*NN);
}
}
// Safer to sort instead of just reversing it,
// but the document of the routine says evals are sorted in increasing order.
// qr gives evals in decreasing order.
for(int i=0;i<NN;i++){
lmd [NN-1-i]=evals_tmp[i];
for(int j=0;j<NN;j++){
Qt((NN-1-i),j)=evec_tmp[i][j];
}
}
#else
assert(0);
#endif
}
void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt,
GridBase *grid)
{
int QRiter = 100*Nm;
int kmin = 1;
int kmax = Nk;
// (this should be more sophisticated)
for(int iter=0; iter<QRiter; ++iter){
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift)
// transformation
QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
// Convergence criterion (redef of kmin and kamx)
for(int j=kmax-1; j>= kmin; --j){
RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
if(fabs(lme[j-1])+dds > dds){
kmax = j+1;
goto continued;
}
}
QRiter = iter;
return;
continued:
for(int j=0; j<kmax-1; ++j){
RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
if(fabs(lme[j])+dds > dds){
kmin = j+1;
break;
}
}
}
std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
abort();
}
};
}
#endif

View File

@ -0,0 +1,406 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
Copyright (C) 2015
Author: Christoph Lehner <clehner@bnl.gov>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid {
struct LanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
ChebyParams, Cheby,/*Chebyshev*/
int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
int, Nk, /*Vecs in Lanczos seek converge*/
int, Nm, /*Total vecs in Lanczos include restart*/
RealD, resid, /*residual*/
int, MaxIt,
RealD, betastp, /* ? */
int, MinRes); // Must restart
};
struct LocalCoherenceLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, saveEvecs,
bool, doFine,
bool, doFineRead,
bool, doCoarse,
bool, doCoarseRead,
LanczosParams, FineParams,
LanczosParams, CoarseParams,
ChebyParams, Smoother,
RealD , coarse_relax_tol,
std::vector<int>, blockSize,
std::string, config,
std::vector < std::complex<double> >, omega,
RealD, mass,
RealD, M5);
};
// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LinearOperatorBase<FineField> &_Linop;
std::vector<FineField> &subspace;
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace)
{
assert(subspace.size() >0);
};
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
}
};
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
OperatorFunction<FineField> & _poly;
LinearOperatorBase<FineField> &_Linop;
std::vector<FineField> &subspace;
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
LinearOperatorBase<FineField>& linop,
std::vector<FineField> & _subspace) :
_poly(poly),
_Linop(linop),
subspace(_subspace)
{ };
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.checkerboard =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
}
};
template<class Fobj,class CComplex,int nbasis>
class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
{ };
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
CoarseField v(B);
RealD eval_poly = eval;
// Apply operator
_Poly(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval = vnum/vden;
v -= eval*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard;
blockPromote(B,fv,_subspace);
_smoother(_Linop,fv,fB);
RealD eval_poly = eval;
_Linop.HermOp(fB,fv);
RealD vnum = real(innerProduct(fB,fv)); // HermOp.
RealD vden = norm2(fB);
RealD vv0 = norm2(fv);
eval = vnum/vden;
fv -= eval*fB;
RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
if( (vv<eresid*eresid) ) return 1;
return 0;
}
};
////////////////////////////////////////////
// Make serializable Lanczos params
////////////////////////////////////////////
template<class Fobj,class CComplex,int nbasis>
class LocalCoherenceLanczos
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<Fobj> FineField;
protected:
GridBase *_CoarseGrid;
GridBase *_FineGrid;
int _checkerboard;
LinearOperatorBase<FineField> & _FineOp;
std::vector<RealD> &evals_fine;
std::vector<RealD> &evals_coarse;
std::vector<FineField> &subspace;
std::vector<CoarseField> &evec_coarse;
private:
std::vector<RealD> _evals_fine;
std::vector<RealD> _evals_coarse;
std::vector<FineField> _subspace;
std::vector<CoarseField> _evec_coarse;
public:
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (_evals_fine),
evals_coarse(_evals_coarse),
subspace (_subspace),
evec_coarse(_evec_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
//////////////////////////////////////////////////////////////////////////
// Alternate constructore, external storage for use by Hadrons module
//////////////////////////////////////////////////////////////////////////
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard,
std::vector<FineField> &ext_subspace,
std::vector<CoarseField> &ext_coarse,
std::vector<RealD> &ext_eval_fine,
std::vector<RealD> &ext_eval_coarse
) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (ext_eval_fine),
evals_coarse(ext_eval_coarse),
subspace (ext_subspace),
evec_coarse (ext_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
};
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = ::sqrt(nn);
v = v * (1.0/nn);
return nn;
}
/*
void fakeFine(void)
{
int Nk = nbasis;
subspace.resize(Nk,_FineGrid);
subspace[0]=1.0;
subspace[0].checkerboard=_checkerboard;
normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){
subspace[k].checkerboard=_checkerboard;
Op(subspace[k-1],subspace[k]);
normalise(subspace[k]);
}
}
*/
void testFine(RealD resid)
{
assert(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
}
}
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
for(int k=0;k<evec_coarse.size();k++){
if ( k < nbasis ) {
assert(ChebySmoothTester.ReconstructEval(k,resid,evec_coarse[k],evals_coarse[k],1.0)==1);
} else {
assert(ChebySmoothTester.ReconstructEval(k,resid*relax,evec_coarse[k],evals_coarse[k],1.0)==1);
}
}
}
void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
assert(nbasis<=Nm);
Chebyshev<FineField> Cheby(cheby_parms);
FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
PlainHermOp<FineField> Op(_FineOp);
evals_fine.resize(Nm);
subspace.resize(Nm,_FineGrid);
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
int Nconv;
IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved
assert(Nstop>=nbasis);
assert(Nconv>=nbasis);
evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid);
}
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);
CoarseField src(_CoarseGrid); src=1.0;
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
assert(Nconv>=Nstop);
evals_coarse.resize(Nstop);
evec_coarse.resize (Nstop,_CoarseGrid);
for (int i=0;i<Nstop;i++){
std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl;
}
}
};
}
#endif

View File

@ -0,0 +1,60 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/NormalEquations.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_NORMAL_EQUATIONS_H
#define GRID_NORMAL_EQUATIONS_H
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public OperatorFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver) {};
void operator() (const Field &in, Field &out){
Field src(in._grid);
_Matrix.Mdag(in,src);
_HermitianSolver(src,out); // Mdag M out = Mdag in
}
};
}
#endif

View File

@ -0,0 +1,119 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecConjugateResidual.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
#define GRID_PREC_CONJUGATE_RESIDUAL_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class PrecConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
LinearFunction<Field> &Preconditioner;
PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit), Preconditioner(Prec)
{
verbose=1;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b, c, d;
RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
GridBase *grid = src._grid;
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=zero;
r = src;
Preconditioner(r,p);
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Ar=Ap;
rAr=pAp;
rAAr=pAAp;
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=0;k<MaxIterations;k++){
Preconditioner(Ap,z);
RealD rq= real(innerProduct(Ap,z));
a = rAr/rq;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,z,r);
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
}
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
}
#endif

View File

@ -0,0 +1,230 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_H
#define GRID_PREC_GCR_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
namespace Grid {
template<class Field>
class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
verbose=1;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
psi=zero;
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src._grid);
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(Linop,src,psi,rsq);
std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.HermOp(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
return;
}
}
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0);
}
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD cp;
RealD a, b, c, d;
RealD zAz, zAAz;
RealD rAq, rq;
GridBase *grid = src._grid;
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
r=src-Az;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOp(z,tmp);
MatTimer.Stop();
ttmp=tmp;
tmp=tmp-r;
/*
std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl;
std::cout<<GridLogMessage<<ttmp<<std::endl;
std::cout<<GridLogMessage<<tmp<<std::endl;
*/
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
if((k==nstep-1)||(cp<rsq)){
return cp;
}
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop();
tmp=tmp-r;
std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
}
assert(0); // never reached
return cp;
}
};
}
#endif

View File

@ -0,0 +1,503 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/SchurRedBlack.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SCHUR_RED_BLACK_H
#define GRID_SCHUR_RED_BLACK_H
/*
* Red black Schur decomposition
*
* M = (Mee Meo) = (1 0 ) (Mee 0 ) (1 Mee^{-1} Meo)
* (Moe Moo) (Moe Mee^-1 1 ) (0 Moo-Moe Mee^-1 Meo) (0 1 )
* = L D U
*
* L^-1 = (1 0 )
* (-MoeMee^{-1} 1 )
* L^{dag} = ( 1 Mee^{-dag} Moe^{dag} )
* ( 0 1 )
* L^{-d} = ( 1 -Mee^{-dag} Moe^{dag} )
* ( 0 1 )
*
* U^-1 = (1 -Mee^{-1} Meo)
* (0 1 )
* U^{dag} = ( 1 0)
* (Meo^dag Mee^{-dag} 1)
* U^{-dag} = ( 1 0)
* (-Meo^dag Mee^{-dag} 1)
***********************
* M psi = eta
***********************
*Odd
* i) D_oo psi_o = L^{-1} eta_o
* eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
*
* Wilson:
* (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1} eta_o
* Stag:
* D_oo psi_o = L^{-1} eta = (eta_o - Moe Mee^{-1} eta_e)
*
* L^-1 eta_o= (1 0 ) (e
* (-MoeMee^{-1} 1 )
*
*Even
* ii) Mee psi_e + Meo psi_o = src_e
*
* => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
*
*
* TODO: Other options:
*
* a) change checkerboards for Schur e<->o
*
* Left precon by Moo^-1
* b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 = (D_oo)^dag M_oo^-dag Moo^-1 L^{-1} eta_o
* eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
*
* Right precon by Moo^-1
* c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1} eta_o
* eta_o' = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
* psi_o = M_oo^-1 phi_o
* TODO: Deflation
*/
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Now make the norm reflect extra factor of Mee
template<class Field> class SchurRedBlackStaggeredSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
//src_o = tmp; assert(src_o.checkerboard ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
guess(src_o, sol_o);
Mtmp = sol_o;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
// Fionn A2A boolean behavioural control
if (subGuess) sol_o = sol_o-Mtmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagMooeeSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false) : _HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=cb;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
guess(src_o,sol_o);
Mtmp = sol_o;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) sol_o = sol_o-Mtmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise = 0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix,class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
guess(src_o,tmp);
Mtmp = tmp;
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) tmp = tmp-Mtmp;
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoMixed {
private:
LinearFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
guess(src_o,tmp);
Mtmp = tmp;
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) tmp = tmp-Mtmp;
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout << GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
}
#endif

View File

@ -0,0 +1,125 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
namespace Grid {
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<Ncache;e++) {
if ( Entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%Ncache;
}
if ( Entries[v].valid ) {
ret = Entries[v].address;
Entries[v].valid = 0;
Entries[v].address = NULL;
Entries[v].bytes = 0;
}
Entries[v].address=ptr;
Entries[v].bytes =bytes;
Entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL;
#ifdef _OPENMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<Ncache;e++){
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
Entries[e].valid = 0;
return Entries[e].address;
}
}
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
}

View File

@ -0,0 +1,315 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/AlignedAllocator.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
namespace Grid {
class PointerCache {
private:
static const int Ncache=8;
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl;\
}
#define profilerAllocate(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalAllocated += (bytes);\
s->currentlyAllocated += (bytes);\
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
#define profilerFree(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalFreed += (bytes);\
s->currentlyAllocated -= (bytes);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
alignedAllocator() throw() { }
alignedAllocator(const alignedAllocator&) throw() { }
template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
~alignedAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
//////////////////
// Hack 2MB align; could make option probably doesn't need configurability
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
// First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif
template<typename _Tp>
class commAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
// BACKTRACEFILE();
exit(0);
}
assert( bcast == (void *) ptr);
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
}
#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,commAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
}; // namespace Grid
#endif

View File

@ -0,0 +1,35 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Cartesian.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H
#include <Grid/cartesian/Cartesian_base.h>
#include <Grid/cartesian/Cartesian_full.h>
#include <Grid/cartesian/Cartesian_red_black.h>
#endif

View File

@ -0,0 +1,292 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/Cartesian_base.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H
namespace Grid{
//////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid
//////////////////////////////////////////////////////////////////////
// unsigned long _ndimension;
// std::vector<int> _processors; // processor grid
// int _processor; // linear processor rank
// std::vector<int> _processor_coor; // linear processor rank
//////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread {
public:
int dummy;
// Give Lattice access
template<class object> friend class Lattice;
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {};
virtual ~GridBase() = default;
// Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
std::vector<int> _ostride; // Outer stride for each dimension
std::vector<int> _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
std::vector<int> _slice_block;// subslice information
std::vector<int> _slice_stride;
std::vector<int> _slice_nblock;
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded;
public:
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const std::vector<int> &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0;
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
//////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
//
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously.
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline int oIndexReduced(std::vector<int> &ocoor)
{
int idx=0;
// ocoor is already reduced so can eliminate the modulo operation
// for fast indexing and inline the routine
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx;
}
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
}
inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) {
lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
}
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
inline int PermuteType(int dimension){
int permute_type=0;
//
// FIXME:
//
// Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just
// a permute.
//
// Cases: PermuteType == 1,2,4,8
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) const { return _isites; };
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const std::vector<int> LocalStarts(void) { return _lstart; };
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
}
}
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
{
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
}
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
{
std::vector<int> pcoor;
std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
std::vector<int> cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
}
*/
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
std::vector<int> coor(_ndimension);
ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
iCoorFromIindex(coor,i_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
oCoorFromOindex (coor,o_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
}
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
}
};
}
#endif

View File

@ -0,0 +1,174 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/Cartesian_full.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H
namespace Grid{
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
class GridCartesian: public GridBase {
public:
int dummy;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
virtual int CheckerBoardFromOindex (int Oindex)
{
return 0;
}
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(const std::vector<int> &site){
return 0;
}
virtual int CheckerBoardDestination(int cb,int shift,int dim){
return 0;
}
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
return shift;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{
Init(dimensions,simd_layout,processor_grid);
}
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{
Init(dimensions,simd_layout,processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
{
Init(dimensions,simd_layout,processor_grid);
}
virtual ~GridCartesian() = default;
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_isCheckerBoarded = false;
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
// Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
};
};
}
#endif

View File

@ -0,0 +1,320 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/Cartesian_red_black.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_RED_BLACK_H
#define GRID_CARTESIAN_RED_BLACK_H
namespace Grid {
static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
std::vector<int> _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;
}
virtual int CheckerBoard(const std::vector<int> &site){
int linear=0;
assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d])
linear=linear+site[d];
}
return (linear&0x1);
}
// Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e)
// we need
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
if(dim != _checker_dim) return shift;
int fulldim =_fdimensions[dim];
shift = (shift+fulldim)%fulldim;
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
}
virtual int CheckerBoardFromOindexTable (int Oindex) {
return _checker_board[Oindex];
}
virtual int CheckerBoardFromOindex (int Oindex)
{
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != _checker_dim) return shift;
int ocb=CheckerBoardFromOindex(osite);
return CheckerBoardShiftForCB(source_cb,dim,shift,ocb);
}
virtual int CheckerBoardDestination(int source_cb,int shift,int dim){
if ( _checker_dim_mask[dim] ) {
// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
// does NOT cause a parity hop.
int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim];
if ( (shift+add) &0x1) {
return 1-source_cb;
} else {
return source_cb;
}
} else {
return source_cb;
}
};
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{
int dims = base->_ndimension;
std::vector<int> checker_dim_mask(dims,1);
int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
};
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(base->_processors,*base)
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
}
virtual ~GridRedBlackCartesian() = default;
#if 0
////////////////////////////////////////////////////////////
// Create redblack grid ;; deprecate these. Should not
// need direct creation of redblack without a full grid to base on
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(processor_grid,*base)
{
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
////////////////////////////////////////////////////////////
// Create redblack grid
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid,*base)
{
std::vector<int> checker_dim_mask(dimensions.size(),1);
int checker_dim = 0;
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
#endif
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim)
{
_isCheckerBoarded = true;
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask = checker_dim_mask;
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
if (d == _checker_dim)
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2;
}
_ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
assert(_rdimensions[d] > 0);
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if (_simd_layout[d] > 1)
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for (int d = 0; d < _ndimension; d++)
{
rvol = rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for (int osite = 0; osite < _osites; osite++)
{
_checker_board[osite] = CheckerBoardFromOindex(osite);
}
};
protected:
virtual int oIndex(std::vector<int> &coor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
else
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
}
return idx;
};
virtual int iIndex(std::vector<int> &lcoor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
else
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
}
return idx;
}
};
}
#endif

View File

@ -0,0 +1,34 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Communicator.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
#include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h>
#endif

View File

@ -0,0 +1,76 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_none.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/mman.h>
namespace Grid {
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////
CartesianCommunicator::CommunicatorPolicy_t
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1;
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int CartesianCommunicator::Dimensions(void) { return _ndimension; };
int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; };
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
}

View File

@ -0,0 +1,207 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_base.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATOR_BASE_H
#define GRID_COMMUNICATOR_BASE_H
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
class CartesianCommunicator : public SharedMemory {
public:
////////////////////////////////////////////
// Policies
////////////////////////////////////////////
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
static int nCommThreads;
////////////////////////////////////////////
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
////////////////////////////////////////////////
// Must call in Grid startup
////////////////////////////////////////////////
static void Init(int *argc, char ***argv);
////////////////////////////////////////////////
// Constructors to sub-divide a parent communicator
// and default to comm world
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const std::vector<int> &pdimensions_in);
virtual ~CartesianCommunicator();
private:
////////////////////////////////////////////////
// Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private
////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
int Dimensions(void) ;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c);
void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
void StencilBarrier(void);
////////////////////////////////////////////////////////////
// Barrier
////////////////////////////////////////////////////////////
void Barrier(void);
////////////////////////////////////////////////////////////
// Broadcast a buffer and composite larger
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// All2All down one dimension
////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0);
assert(dim<_ndimension);
assert(in.size()==out.size());
int numnode = _processors[dim];
uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode;
assert(numnode * words == in.size());
assert(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
}
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};
};
}
#endif

View File

@ -0,0 +1,514 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
// First initialise of comms system
////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
assert(0);
}
Grid_quiesce_nodes();
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
}
///////////////////////////////////////////////////////////////////////////
// Use cartesian communicators now even in MPI3
///////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
MPI_Comm optimal_comm;
////////////////////////////////////////////////////
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////////
MPI_Comm_free(&optimal_comm);
}
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size();
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
// int Nparent = parent._processors ;
// std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
// std::cout << " Parent size "<<Nparent <<std::endl;
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
// std::cout << " child size "<<childsize <<std::endl;
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
int crank;
// Mpi uses the reverse Lexico convention to us; so reversed routines called
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
MPI_Comm comm_split;
if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
////////////////////////////////////////////////////////////////
// Split the communicator
////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
} else {
srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take the right SHM buffers
//////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split);
///////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////
MPI_Comm_free(&comm_split);
if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
for(int d=0;d<processors.size();d++){
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
}
}
for(int d=0;d<processors.size();d++){
assert(_processor_coor[d] == ccoor[d] );
}
}
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
////////////////////////////////////////////////////
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
if ( 0 && (communicator_base != communicator_world) ) {
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
for(int d=0;d<_processors.size();d++){
std::cout << _processor_coor[d]<<" ";
}
std::cout << std::endl;
}
int Size;
MPI_Comm_size(communicator,&Size);
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
assert(Size==_Nprocessors);
}
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised) {
MPI_Comm_free(&communicator);
for(int i=0;i<communicator_halo.size();i++){
MPI_Comm_free(&communicator_halo[i]);
}
}
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
int me;
CartesianCommunicator Comm(row,*this,me);
Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
// (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
}

View File

@ -0,0 +1,165 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_none.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char *** arv)
{
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_processors = processors;
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
// Require 1^N processor grid for fake
_Nprocessors=1;
_processor = 0;
for(int d=0;d<_ndimension;d++) {
assert(_processors[d]==1);
_processor_coor[d] = 0;
}
SetCommunicator(communicator_world);
}
CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSum(uint64_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
}
int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0;
dest=0;
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};
}

View File

@ -0,0 +1,92 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
// static data
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup;
int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
int GlobalSharedMemory::WorldShmSize;
std::vector<int> GlobalSharedMemory::WorldShmRanks;
Grid_MPI_Comm GlobalSharedMemory::WorldComm;
int GlobalSharedMemory::WorldSize;
int GlobalSharedMemory::WorldRank;
int GlobalSharedMemory::WorldNodes;
int GlobalSharedMemory::WorldNode;
void GlobalSharedMemory::SharedMemoryFree(void)
{
assert(_ShmAlloc);
assert(_ShmAllocBytes>0);
for(int r=0;r<WorldShmSize;r++){
munmap(WorldShmCommBufs[r],_ShmAllocBytes);
}
_ShmAlloc = 0;
_ShmAllocBytes = 0;
}
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
if (heap_bytes >= heap_size) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
assert(heap_bytes<heap_size);
}
return ptr;
}
void SharedMemory::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}
void *SharedMemory::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
}
}

View File

@ -0,0 +1,165 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
// b) shared memory buffers container
// -- static globally shared; init once
// -- per instance set of buffers.
//
#pragma once
#include <Grid/GridCore.h>
#if defined (GRID_COMMS_MPI3)
#include <mpi.h>
#endif
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
namespace Grid {
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
class GlobalSharedMemory {
private:
static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation
static int _ShmSetup;
static int _ShmAlloc;
static uint64_t _ShmAllocBytes;
public:
static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
static uint64_t MAX_MPI_SHM_BYTES;
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
static Grid_MPI_Comm WorldShmComm;
static int WorldShmRank;
static int WorldShmSize;
static int WorldNodes;
static int WorldNode;
static std::vector<int> WorldShmRanks;
//////////////////////////////////////////////////////////////////////////////////////
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
};
//////////////////////////////
// one per communicator
//////////////////////////////
class SharedMemory
{
private:
static const int MAXLOG2RANKSPERNODE = 16;
size_t heap_top;
size_t heap_bytes;
size_t heap_size;
protected:
Grid_MPI_Comm ShmComm; // for barriers
int ShmRank;
int ShmSize;
std::vector<void *> ShmCommBufs;
std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks
public:
SharedMemory() {};
~SharedMemory();
///////////////////////////////////////////////////////////////////////////////////////
// set the buffers & sizes
///////////////////////////////////////////////////////////////////////////////////////
void SetCommunicator(Grid_MPI_Comm comm);
////////////////////////////////////////////////////////////////////////
// For this instance ; disjoint buffer sets between splits if split grid
////////////////////////////////////////////////////////////////////////
void ShmBarrier(void);
///////////////////////////////////////////////////
// Call on any instance
///////////////////////////////////////////////////
void SharedMemoryTest(void);
void *ShmBufferSelf(void);
void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////
int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
int RankCount(void) { return GlobalSharedMemory::WorldSize;};
};
}

View File

@ -0,0 +1,652 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <pwd.h>
namespace Grid {
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
WorldComm = comm;
MPI_Comm_rank(WorldComm,&WorldRank);
MPI_Comm_size(WorldComm,&WorldSize);
// WorldComm, WorldSize, WorldRank
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
// WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes
WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ?
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (WorldComm, &WorldGroup);
MPI_Comm_group (WorldShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize); for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
WorldShmRanks.resize(WorldSize);
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
std::vector<int> MyGroup;
MyGroup.resize(WorldShmSize);
for(int rank=0;rank<WorldSize;rank++){
if(WorldShmRanks[rank]!=MPI_UNDEFINED){
assert(g<WorldShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(WorldNodes,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the node of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
WorldNode=-1;
for(int g=0;g<WorldNodes;g++){
if (myleader == leaders_group[g]){
WorldNode=g;
}
}
assert(WorldNode!=-1);
_ShmSetup=1;
}
// Gray encode support
int BinaryToGray (int binary) {
int gray = (binary>>1)^binary;
return gray;
}
int Log2Size(int TwoToPower,int MAXLOG2)
{
int log2size = -1;
for(int i=0;i<=MAXLOG2;i++){
if ( (0x1<<i) == TwoToPower ) {
log2size = i;
break;
}
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
#undef HYPERCUBE
#ifdef HYPERCUBE
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname
////////////////////////////////////////////////////////////////
// n runs 0...7 9...16 18...25 27...34 (8*4) 5 bits
// i runs 0..7 3 bits
// r runs 0..3 2 bits
// 2^10 = 1024 nodes
const int maxhdim = 10;
std::vector<int> HyperCubeCoords(maxhdim,0);
std::vector<int> RootHyperCubeCoords(maxhdim,0);
int R;
int I;
int N;
const int namelen = _POSIX_HOST_NAME_MAX;
char name[namelen];
// Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
assert(nscan==3);
int nlo = N%9;
int nhi = N/9;
uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
uint32_t rootcoor = hypercoor;
//////////////////////////////////////////////////////////////////
// Print debug info
//////////////////////////////////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
std::string hname(name);
std::cout << "hostname "<<hname<<std::endl;
std::cout << "R " << R << " I " << I << " N "<< N<<
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
//////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition.
//////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor;
assert(hypercoor<WorldSize);
assert(hypercoor>=0);
//////////////////////////////////////
// Printing
//////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Map Hcube according to physical lattice
// must partition. Loop over dims and find out who would join.
////////////////////////////////////////////////////////////////
int hcoor = hypercoor;
for(int d=0;d<ndimension;d++){
int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
int msk = (0x1<<bits)-1;
HyperCoor[d]=hcoor & msk;
HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
hcoor = hcoor >> bits;
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#else
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
std::vector<int> shmids(WorldShmSize);
if ( WorldShmRank == 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes;
key_t key = IPC_PRIVATE;
int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
if (Hugepages) flags|=SHM_HUGETLB;
#endif
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
int errsv = errno;
printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %lld\n",size);
printf("flags %d\n",flags);
perror("shmget");
exit(1);
}
}
}
MPI_Barrier(WorldShmComm);
MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
MPI_Barrier(WorldShmComm);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
if (WorldShmCommBufs[r] == (uint64_t *)-1) {
perror("Shared memory attach failure");
shmctl(shmids[r], IPC_RMID, NULL);
exit(2);
}
}
MPI_Barrier(WorldShmComm);
///////////////////////////////////
// Mark for clean up
///////////////////////////////////
for(int r=0;r<WorldShmSize;r++){
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
}
MPI_Barrier(WorldShmComm);
_ShmAlloc=1;
_ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
for(int r=0;r<WorldShmSize;r++){
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd == -1) {
printf("open %s failed\n",shm_name);
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbf and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
int fd=-1;
int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHMOPEN
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
// the posix shm virtual file system
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
char shm_name [NAME_MAX];
if ( WorldShmRank == 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes;
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (flags) mmap_flag |= MAP_HUGETLB;
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
}
}
MPI_Barrier(WorldShmComm);
if ( WorldShmRank != 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes ;
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
}
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
int rank, size;
MPI_Comm_rank(comm,&rank);
MPI_Comm_size(comm,&size);
ShmRanks.resize(size);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
ShmCommBufs.resize(ShmSize);
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
assert (GlobalSharedMemory::ShmAlloc()==1);
heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){
uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
}
ShmBufferFreeAll();
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group FullGroup, ShmGroup;
MPI_Comm_group (comm , &FullGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void)
{
MPI_Barrier (ShmComm);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void)
{
ShmBarrier();
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GlobalSharedMemory::WorldNode;
check[1] = r;
check[2] = 0x5A5A5A;
}
}
ShmBarrier();
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
}
ShmBarrier();
}
void *SharedMemory::ShmBuffer(int rank)
{
int gpeer = ShmRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}
SharedMemory::~SharedMemory()
{
int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised);
if ( !MPI_is_finalised ) {
MPI_Comm_free(&ShmComm);
}
};
}

View File

@ -0,0 +1,128 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
WorldComm = 0;
WorldRank = 0;
WorldSize = 1;
WorldShmComm = 0 ;
WorldShmRank = 0 ;
WorldShmSize = 1 ;
WorldNodes = 1 ;
WorldNode = 0 ;
WorldShmRanks.resize(WorldSize); WorldShmRanks[0] = 0;
WorldShmCommBufs.resize(1);
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
optimal_comm = WorldComm;
}
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) {
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
#endif
bzero(ShmCommBuf,bytes);
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
};
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
assert(GlobalSharedMemory::ShmAlloc()==1);
ShmRanks.resize(1);
ShmCommBufs.resize(1);
ShmRanks[0] = 0;
ShmRank = 0;
ShmSize = 1;
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
heap_size = GlobalSharedMemory::ShmAllocBytes();
ShmBufferFreeAll();
return;
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void){ return ; }
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void) { return; }
void *SharedMemory::ShmBuffer(int rank)
{
return NULL;
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
return NULL;
}
SharedMemory::~SharedMemory()
{};
}

52
Grid/cshift/Cshift.h Normal file
View File

@ -0,0 +1,52 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Cshift.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_
#include <Grid/cshift/Cshift_common.h>
#ifdef GRID_COMMS_NONE
#include <Grid/cshift/Cshift_none.h>
#endif
#ifdef GRID_COMMS_MPI
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_MPI3
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_MPIT
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif
#endif

391
Grid/cshift/Cshift_common.h Normal file
View File

@ -0,0 +1,391 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cshift/Cshift_common.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_COMMON_H_
#define _GRID_CSHIFT_COMMON_H_
namespace Grid {
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int ent = 0;
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int bo = n*e2;
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
}
}
}
}
parallel_for(int i=0;i<ent;i++){
buffer[table[i].first]=rhs._odata[table[i].second];
}
}
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension];
if ( cbmask ==0x3){
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs._odata[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
std::cout << " Dense packed buffer WARNING " <<std::endl;
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*n1;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs._odata[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
}
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent =0;
if ( cbmask ==0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++);
}
}
}
}
parallel_for(int i=0;i<ent;i++){
rhs._odata[table[i].first]=buffer[table[i].second];
}
}
//////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset);
}
}
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset);
}
}
}
}
}
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
if(cbmask == 0x3 ){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
}
}
parallel_for(int i=0;i<ent;i++){
lhs._odata[table[i].first]=rhs._odata[table[i].second];
}
}
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
double t_tab,t_perm;
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
}
parallel_for(int i=0;i<ent;i++){
permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
}
}
//////////////////////////////////////////////////////
// Local to node Cshift
//////////////////////////////////////////////////////
template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
double t_local;
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
} else {
Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid = rhs._grid;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int gd = grid->_gdimensions[dimension];
int ly = grid->_simd_layout[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
// the permute type
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
int permute_type_dist;
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
// wrap is whether sshift > rd.
// num is sshift mod rd.
//
// shift 7
//
// XoXo YcYc
// oXoX cYcY
// XoXo YcYc
// oXoX cYcY
//
// sshift --
//
// XX YY ; 3
// XX YY ; 0
// XX YY ; 3
// XX YY ; 0
//
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd; wrap=wrap % ly;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = (wrap+1)%ly;
if ( (ly>2) && (permute_slice) ) {
assert(permute_type & RotateBit);
permute_type_dist = permute_type|permute_slice;
} else {
permute_type_dist = permute_type;
}
}
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
}
}
#endif

262
Grid/cshift/Cshift_mpi.h Normal file
View File

@ -0,0 +1,262 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cshift/Cshift_mpi.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_
namespace Grid {
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
// the permute type
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
if ( !comm_dim ) {
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
}
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3);
} else {
// std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
//std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
//std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs._grid;
Lattice<vobj> temp(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int pd = rhs._grid->_processors[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs._grid;
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); //
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
}
#endif

39
Grid/cshift/Cshift_none.h Normal file
View File

@ -0,0 +1,39 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cshift/Cshift_none.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_NONE_H_
#define _GRID_CSHIFT_NONE_H_
namespace Grid {
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
Cshift_local(ret,rhs,dimension,shift);
return ret;
}
}
#endif

14750
Grid/json/json.hpp Normal file

File diff suppressed because it is too large Load Diff

33
Grid/lattice/Lattice.h Normal file
View File

@ -0,0 +1,33 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Lattice.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H
#include <Grid/lattice/Lattice_base.h>
#endif

466
Grid/lattice/Lattice_ET.h Normal file
View File

@ -0,0 +1,466 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_ET.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_ET_H
#define GRID_LATTICE_ET_H
#include <iostream>
#include <tuple>
#include <typeinfo>
#include <vector>
namespace Grid {
////////////////////////////////////////////////////
// Predicated where support
////////////////////////////////////////////////////
template <class iobj, class vobj, class robj>
inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
const robj &iffalse) {
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
const int words = sizeof(vobj) / sizeof(vector_type);
std::vector<Integer> mask(Nsimd);
std::vector<scalar_object> truevals(Nsimd);
std::vector<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
////////////////////////////////////////////
// recursive evaluation of expressions; Could
// switch to generic approach with variadics, a la
// Antonin's Lat Sim but the repack to variadic with popped
// from tuple is hideous; C++14 introduces std::make_index_sequence for this
////////////////////////////////////////////
// leaf eval of lattice ; should enable if protect using traits
template <typename T>
using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T>
using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
//Specialization of getVectorType for lattices
template<typename T>
struct getVectorType<Lattice<T> >{
typedef typename Lattice<T>::vector_object type;
};
template<class sobj>
inline sobj eval(const unsigned int ss, const sobj &arg)
{
return arg;
}
template <class lobj>
inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
return arg._odata[ss];
}
// handle nodes in syntax tree
template <typename Op, typename T1>
auto inline eval(
const unsigned int ss,
const LatticeUnaryExpression<Op, T1> &expr) // eval one operand
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)));
}
template <typename Op, typename T1, typename T2>
auto inline eval(
const unsigned int ss,
const LatticeBinaryExpression<Op, T1, T2> &expr) // eval two operands
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)));
}
template <typename Op, typename T1, typename T2, typename T3>
auto inline eval(const unsigned int ss,
const LatticeTrinaryExpression<Op, T1, T2, T3>
&expr) // eval three operands
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)));
}
//////////////////////////////////////////////////////////////////////////
// Obtain the grid from an expression, ensuring conformable. This must follow a
// tree recursion
//////////////////////////////////////////////////////////////////////////
template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
{
if (grid) {
conformable(grid, lat._grid);
}
grid = lat._grid;
}
template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid,
const T1 &notlat) // non-lattice leaf
{}
template <typename Op, typename T1>
inline void GridFromExpression(GridBase *&grid,
const LatticeUnaryExpression<Op, T1> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
}
template <typename Op, typename T1, typename T2>
inline void GridFromExpression(
GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, std::get<1>(expr.second));
}
template <typename Op, typename T1, typename T2, typename T3>
inline void GridFromExpression(
GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, std::get<1>(expr.second));
GridFromExpression(grid, std::get<2>(expr.second));
}
//////////////////////////////////////////////////////////////////////////
// Obtain the CB from an expression, ensuring conformable. This must follow a
// tree recursion
//////////////////////////////////////////////////////////////////////////
template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{
if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.checkerboard);
}
cb = lat.checkerboard;
// std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
}
template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{
// std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
}
template <typename Op, typename T1>
inline void CBFromExpression(int &cb,
const LatticeUnaryExpression<Op, T1> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
// std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
}
template <typename Op, typename T1, typename T2>
inline void CBFromExpression(int &cb,
const LatticeBinaryExpression<Op, T1, T2> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, std::get<1>(expr.second));
// std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
}
template <typename Op, typename T1, typename T2, typename T3>
inline void CBFromExpression(
int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, std::get<1>(expr.second));
CBFromExpression(cb, std::get<2>(expr.second));
// std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
}
////////////////////////////////////////////
// Unary operators and funcs
////////////////////////////////////////////
#define GridUnopClass(name, ret) \
template <class arg> \
struct name { \
static auto inline func(const arg a) -> decltype(ret) { return ret; } \
};
GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryReal, real(a));
GridUnopClass(UnaryImag, imag(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a));
GridUnopClass(UnaryAcos, acos(a));
GridUnopClass(UnaryLog, log(a));
GridUnopClass(UnaryExp, exp(a));
////////////////////////////////////////////
// Binary operators
////////////////////////////////////////////
#define GridBinOpClass(name, combination) \
template <class left, class right> \
struct name { \
static auto inline func(const left &lhs, const right &rhs) \
-> decltype(combination) const { \
return combination; \
} \
}
GridBinOpClass(BinaryAdd, lhs + rhs);
GridBinOpClass(BinarySub, lhs - rhs);
GridBinOpClass(BinaryMul, lhs *rhs);
GridBinOpClass(BinaryDiv, lhs /rhs);
GridBinOpClass(BinaryAnd, lhs &rhs);
GridBinOpClass(BinaryOr, lhs | rhs);
GridBinOpClass(BinaryAndAnd, lhs &&rhs);
GridBinOpClass(BinaryOrOr, lhs || rhs);
////////////////////////////////////////////////////
// Trinary conditional op
////////////////////////////////////////////////////
#define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \
struct name { \
static auto inline func(const predicate &pred, const left &lhs, \
const right &rhs) -> decltype(combination) const { \
return combination; \
} \
}
GridTrinOpClass(
TrinaryWhere,
(predicatedWhere<predicate, typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs,
rhs)));
////////////////////////////////////////////
// Operator syntactical glue
////////////////////////////////////////////
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) \
name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \
template <typename T1, \
typename std::enable_if<is_lattice<T1>::value || \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &arg) \
->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg))); \
}
#define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value || \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
}
#define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value && \
!is_lattice_expr<T1>::value, \
T1>::type * = nullptr, \
typename std::enable_if<is_lattice<T2>::value || \
is_lattice_expr<T2>::value, \
T2>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
}
#define GRID_DEF_BINOP(op, name) \
GRID_BINOP_LEFT(op, name); \
GRID_BINOP_RIGHT(op, name);
#define GRID_DEF_TRINOP(op, name) \
template <typename T1, typename T2, typename T3> \
inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
->decltype( \
LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) { \
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs))); \
}
////////////////////////
// Operator definitions
////////////////////////
GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(adj, UnaryAdj);
GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(real, UnaryReal);
GRID_DEF_UNOP(imag, UnaryImag);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin);
GRID_DEF_UNOP(acos, UnaryAcos);
GRID_DEF_UNOP(log, UnaryLog);
GRID_DEF_UNOP(exp, UnaryExp);
GRID_DEF_BINOP(operator+, BinaryAdd);
GRID_DEF_BINOP(operator-, BinarySub);
GRID_DEF_BINOP(operator*, BinaryMul);
GRID_DEF_BINOP(operator/, BinaryDiv);
GRID_DEF_BINOP(operator&, BinaryAnd);
GRID_DEF_BINOP(operator|, BinaryOr);
GRID_DEF_BINOP(operator&&, BinaryAndAnd);
GRID_DEF_BINOP(operator||, BinaryOrOr);
GRID_DEF_TRINOP(where, TrinaryWhere);
/////////////////////////////////////////////////////////////
// Closure convenience to force expression to evaluate
/////////////////////////////////////////////////////////////
template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
expr);
return ret;
}
template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second))))>
ret(expr);
return ret;
}
template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second)),
eval(0, std::get<2>(expr.second))))> {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second)),
eval(0, std::get<2>(expr.second))))>
ret(expr);
return ret;
}
#undef GRID_UNOP
#undef GRID_BINOP
#undef GRID_TRINOP
#undef GRID_DEF_UNOP
#undef GRID_DEF_BINOP
#undef GRID_DEF_TRINOP
}
#if 0
using namespace Grid;
int main(int argc,char **argv){
Lattice<double> v1(16);
Lattice<double> v2(16);
Lattice<double> v3(16);
BinaryAdd<double,double> tmp;
LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &>
expr(std::make_pair(tmp,
std::forward_as_tuple(v1,v2)));
tmp.func(eval(0,v1),eval(0,v2));
auto var = v1+v2;
std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
v3=v1+v2;
v3=v1+v2+v1*v2;
};
void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3)
{
v3=v1+v2+v1*v2;
}
#endif
#endif

View File

@ -0,0 +1,255 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_arith.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_ARITH_H
#define GRID_LATTICE_ARITH_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(lhs,ret);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
obj1 tmp;
mult(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
}
}
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,lhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
obj1 tmp;
mac(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
}
}
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,lhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(lhs,ret);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs._odata[ss],&rhs);
#endif
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mult(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mac(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
}
}
template<class sobj,class vobj> strong_inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
parallel_for(int ss=0;ss<x._grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = a*x._odata[ss]+y._odata[ss];
vstream(ret._odata[ss],tmp);
#else
ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
#endif
}
}
template<class sobj,class vobj> strong_inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
parallel_for(int ss=0;ss<x._grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vstream(ret._odata[ss],tmp);
#else
ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
#endif
}
}
template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
return axpby_norm_fast(ret,a,b,x,y);
}
}
#endif

375
Grid/lattice/Lattice_base.h Normal file
View File

@ -0,0 +1,375 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_base.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_BASE_H
#define GRID_LATTICE_BASE_H
#define STREAMING_STORES
namespace Grid {
// TODO:
// mac,real,imag
// Functionality:
// -=,+=,*=,()
// add,+,sub,-,mult,mac,*
// adj,conjugate
// real,imag
// transpose,transposeIndex
// trace,traceIndex
// peekIndex
// innerProduct,outerProduct,
// localNorm2
// localInnerProduct
extern int GridCshiftPermuteMap[4][16];
////////////////////////////////////////////////
// Basic expressions used in Expression Template
////////////////////////////////////////////////
class LatticeBase
{
public:
virtual ~LatticeBase(void) = default;
GridBase *_grid;
};
class LatticeExpressionBase {};
template <typename Op, typename T1>
class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
public:
LatticeUnaryExpression(const std::pair<Op,std::tuple<T1> > &arg): std::pair<Op,std::tuple<T1> >(arg) {};
};
template <typename Op, typename T1, typename T2>
class LatticeBinaryExpression : public std::pair<Op,std::tuple<T1,T2> > , public LatticeExpressionBase {
public:
LatticeBinaryExpression(const std::pair<Op,std::tuple<T1,T2> > &arg): std::pair<Op,std::tuple<T1,T2> >(arg) {};
};
template <typename Op, typename T1, typename T2, typename T3>
class LatticeTrinaryExpression :public std::pair<Op,std::tuple<T1,T2,T3> >, public LatticeExpressionBase {
public:
LatticeTrinaryExpression(const std::pair<Op,std::tuple<T1,T2,T3> > &arg): std::pair<Op,std::tuple<T1,T2,T3> >(arg) {};
};
void inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
template<class vobj>
class Lattice : public LatticeBase
{
public:
int checkerboard;
Vector<vobj> _odata;
// to pthread need a computable loop where loop induction is not required
int begin(void) { return 0;};
int end(void) { return _odata.size(); }
vobj & operator[](int i) { return _odata[i]; };
const vobj & operator[](int i) const { return _odata[i]; };
public:
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef vobj vector_object;
////////////////////////////////////////////////////////////////////////////////
// Expression Template closure support
////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
return *this;
}
template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
return *this;
}
template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
//vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,eval(ss,expr));
#else
_odata[ss] = eval(ss,expr);
#endif
}
return *this;
}
//GridFromExpression is tricky to do
template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
_odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
};
template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
_odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
};
template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
_grid = nullptr;
GridFromExpression(_grid,expr);
assert(_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
checkerboard=cb;
_odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
vstream(_odata[ss] ,eval(ss,expr));
}
};
//////////////////////////////////////////////////////////////////
// Constructor requires "grid" passed.
// what about a default grid?
//////////////////////////////////////////////////////////////////
Lattice(GridBase *grid) : _odata(grid->oSites()) {
_grid = grid;
// _odata.reserve(_grid->oSites());
// _odata.resize(_grid->oSites());
// std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
}
Lattice(const Lattice& r){ // copy constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
Lattice(Lattice&& r){ // move constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata=std::move(r._odata);
}
inline Lattice<vobj> & operator = (Lattice<vobj> && r)
{
_grid = r._grid;
checkerboard = r.checkerboard;
_odata =std::move(r._odata);
return *this;
}
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
virtual ~Lattice(void) = default;
void reset(GridBase* grid) {
if (_grid != grid) {
_grid = grid;
_odata.resize(grid->oSites());
checkerboard = 0;
}
}
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r;
}
return *this;
}
// *=,+=,-= operators inherit behvour from correspond */+/- operation
template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r;
return *this;
}
template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r;
return *this;
}
template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r;
return *this;
}
}; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
std::vector<int> gcoor;
typedef typename vobj::scalar_object sobj;
sobj ss;
for(int g=0;g<o._grid->_gsites;g++){
o._grid->GlobalIndexToGlobalCoor(g,gcoor);
peekSite(ss,o,gcoor);
stream<<"[";
for(int d=0;d<gcoor.size();d++){
stream<<gcoor[d];
if(d!=gcoor.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
return stream;
}
}
#include "Lattice_conformable.h"
#define GRID_LATTICE_EXPRESSION_TEMPLATES
#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES
#include "Lattice_ET.h"
#else
#include "Lattice_overload.h"
#endif
#include "Lattice_arith.h"
#include "Lattice_trace.h"
#include "Lattice_transpose.h"
#include "Lattice_local.h"
#include "Lattice_reduction.h"
#include "Lattice_peekpoke.h"
#include "Lattice_reality.h"
#include "Lattice_comparison_utils.h"
#include "Lattice_comparison.h"
#include "Lattice_coordinate.h"
#include "Lattice_where.h"
#include "Lattice_rng.h"
#include "Lattice_unary.h"
#include "Lattice_transfer.h"
#endif

View File

@ -0,0 +1,169 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_comparison.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_COMPARISON_H
#define GRID_LATTICE_COMPARISON_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////
// relational operators
//
// Support <,>,<=,>=,==,!=
//
//Query supporting bitwise &, |, ^, !
//Query supporting logical &&, ||,
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// compare lattice to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare lattice to scalar
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vInteger> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare scalar to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// Map to functors
//////////////////////////////////////////////////////////////////////////
// Less than
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
}
#endif

View File

@ -0,0 +1,232 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_comparison_utils.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid {
/////////////////////////////////////////
// This implementation is a bit poor.
//
// Only support relational logical operations (<, > etc)
// on scalar objects. Therefore can strip any tensor structures.
//
// Should guard this with isGridTensor<> enable if?
/////////////////////////////////////////
//
// Generic list of functors
//
template<class lobj,class robj> class veq {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class vne {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class vlt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class vle {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class vgt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class vge {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
};
// Generic list of functors
template<class lobj,class robj> class seq {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class sne {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class slt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class sle {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class sgt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class sge {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
};
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Integer and real get extra relational functions.
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<scalar> vrhs(vsimd::Nsimd());
std::vector<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(lhs,vlhs);
extract<vsimd,scalar>(rhs,vrhs);
for(int s=0;s<vsimd::Nsimd();s++){
vpred[s] = sop(vlhs[s],vrhs[s]);
}
merge<vInteger,Integer>(ret,vpred);
return ret;
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(lhs,vlhs);
for(int s=0;s<vsimd::Nsimd();s++){
vpred[s] = sop(vlhs[s],rhs);
}
merge<vInteger,Integer>(ret,vpred);
return ret;
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(rhs,vrhs);
for(int s=0;s<vsimd::Nsimd();s++){
vpred[s] = sop(lhs,vrhs[s]);
}
merge<vInteger,Integer>(ret,vpred);
return ret;
}
#define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \
return lhs._internal op rhs; \
} \
template<class vsimd>\
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \
return lhs op rhs._internal; \
} \
#define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
}
DECLARE_RELATIONAL(<,slt);
DECLARE_RELATIONAL(<=,sle);
DECLARE_RELATIONAL(>,sgt);
DECLARE_RELATIONAL(>=,sge);
DECLARE_RELATIONAL_EQ(==,seq);
DECLARE_RELATIONAL(!=,sne);
#undef DECLARE_RELATIONAL
}
#endif

View File

@ -0,0 +1,40 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_conformable.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_CONFORMABLE_H
#define GRID_LATTICE_CONFORMABLE_H
namespace Grid {
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs._grid == rhs._grid);
assert(lhs.checkerboard == rhs.checkerboard);
}
}
#endif

View File

@ -0,0 +1,56 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_coordinate.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_COORDINATE_H
#define GRID_LATTICE_COORDINATE_H
namespace Grid {
template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
{
typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type;
GridBase *grid = l._grid;
int Nsimd = grid->iSites();
std::vector<int> gcoor;
std::vector<scalar_type> mergebuf(Nsimd);
vector_type vI;
for(int o=0;o<grid->oSites();o++){
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu];
}
merge<vector_type,scalar_type>(vI,mergebuf);
l._odata[o]=vI;
}
};
}
#endif

View File

@ -0,0 +1,75 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_local.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_LOCALREDUCTION_H
#define GRID_LATTICE_LOCALREDUCTION_H
///////////////////////////////////////////////
// localInner, localNorm, outerProduct
///////////////////////////////////////////////
namespace Grid {
/////////////////////////////////////////////////////
// Non site, reduced locally reduced routines
/////////////////////////////////////////////////////
// localNorm2,
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
}
#endif

View File

@ -0,0 +1,138 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_overload.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_OVERLOAD_H
#define GRID_LATTICE_OVERLOAD_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
Lattice<vobj> ret(r._grid);
parallel_for(int ss=0;ss<r._grid->oSites();ss++){
vstream(ret._odata[ss], -r._odata[ss]);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
//NB mult performs conformable check. Do not reapply here for performance.
/////////////////////////////////////////////////////////////////////////////////////
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
mult(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
add(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
sub(ret,lhs,rhs);
return ret;
}
// Scalar BinOp Lattice ;generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs*rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs+rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
}
return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]*rhs;
}
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]+rhs;
}
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]-rhs;
}
return ret;
}
}
#endif

View File

@ -0,0 +1,205 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_peekpoke.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_PEEK_H
#define GRID_LATTICE_PEEK_H
///////////////////////////////////////////////
// Peeking and poking around
///////////////////////////////////////////////
namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////
// Peek internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
ret.checkerboard=lhs.checkerboard;
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
}
return ret;
};
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
ret.checkerboard=lhs.checkerboard;
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Poke internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
{
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
}
}
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
{
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
}
}
//////////////////////////////////////////////////////
// Poke a scalar object into the SIMD array
//////////////////////////////////////////////////////
template<class vobj,class sobj>
void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
// Optional to broadcast from node 0.
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
grid->Broadcast(grid->BossRank(),s);
std::vector<sobj> buf(Nsimd);
// extract-modify-merge cycle is easiest way and this is not perf critical
if ( rank == grid->ThisRank() ) {
extract(l._odata[odx],buf);
buf[idx] = s;
merge(l._odata[odx],buf);
}
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard == l._grid->CheckerBoard(site));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
std::vector<sobj> buf(Nsimd);
extract(l._odata[odx],buf);
s = buf[idx];
grid->Broadcast(rank,s);
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid = l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
idx= grid->iIndex(site);
odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
pt[w] = vp[idx+w*Nsimd];
}
return;
};
template<class vobj,class sobj>
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
idx= grid->iIndex(site);
odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w];
}
return;
};
}
#endif

View File

@ -0,0 +1,57 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reality.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REALITY_H
#define GRID_LATTICE_REALITY_H
// FIXME .. this is the sector of the code
// I am most worried about the directions
// The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome
namespace Grid {
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = adj(lhs._odata[ss]);
}
return ret;
};
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = conjugate(lhs._odata[ss]);
}
return ret;
};
}
#endif

View File

@ -0,0 +1,733 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REDUCTION_H
#define GRID_LATTICE_REDUCTION_H
#include <Grid/Grid_Eigen_Dense.h>
namespace Grid {
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
auto nrm = innerProduct(arg,arg);
return std::real(nrm);
}
// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
GridBase *grid = left._grid;
const int pad = 8;
ComplexD inner;
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
}
// All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}
inner=0.0;
for(int i=0;i<grid->SumArraySize();i++){
inner = inner+sumarray[i*pad];
}
right._grid->GlobalSum(inner);
return inner;
}
/////////////////////////
// Fast axpby_norm
// z = a x + b y
// return norm z
/////////////////////////
template<class sobj,class vobj> strong_inline RealD
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
sobj one(1.0);
return axpby_norm_fast(z,a,one,x,y);
}
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
const int pad = 8;
z.checkerboard = x.checkerboard;
conformable(z,x);
conformable(x,y);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x._grid;
Vector<RealD> sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
// private to thread; sub summation
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vnrm = vnrm + innerProductD(tmp,tmp);
vstream(z._odata[ss],tmp);
}
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}
nrm = 0.0; // sum across threads; linear in thread count but fast
for(int i=0;i<grid->SumArraySize();i++){
nrm = nrm+sumarray[i*pad];
}
z._grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
{
return sum(closure(expr));
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
// may be important for correlation functions
// But easily avoided by using double precision fields
///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid;
assert(grid!=NULL);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
parallel_for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data._odata[ss];
}
}
}
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
extract(lvSum[rt],extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx];
}
}
// sum over nodes.
sobj gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=zero;
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
}
template<class vobj>
static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
// std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
typedef typename vobj::scalar_type scalar_type;
std::vector<scalar_type> lsSum;
localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
// std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
}
template <class vobj>
static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
// std::cout << GridLogMessage << "Start prep" << std::endl;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
assert(grid!=NULL);
conformable(grid,rhs._grid);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// std::cout << GridLogMessage << "Start alloc" << std::endl;
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type>> extracted(Nsimd); // splitting the SIMD
// std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv;
parallel_for(int r=0;r<rd;r++)
{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
lvSum[r] = lvSum[r] + vv;
}
}
}
// std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
}
template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
int fd = result.size();
int ld = lsSum.size();
// sum over nodes.
std::vector<scalar_type> gsum;
gsum.resize(fd, scalar_type(0.0));
// std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum[t]=lsSum[lt];
}
}
// std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
// std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
grid->GlobalSumVector(&gsum[0], fd);
// std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
result = gsum;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
assert(grid!=NULL);
conformable(grid,rhs._grid);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
parallel_for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
lvSum[r]=lvSum[r]+vv;
}
}
}
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// sum over nodes.
scalar_type gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=scalar_type(0.0);
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
}
template<class vobj>
static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = rhs._grid->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock);
sliceInnerProductVector(ip,rhs,rhs,Orthog);
for(int ss=0;ss<Nblock;ss++){
sn[ss] = real(ip[ss]);
}
};
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int orthogdim,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced tensor_reduced;
scalar_type zscale(scale);
GridBase *grid = X._grid;
int Nsimd =grid->Nsimd();
int Nblock =grid->GlobalDimensions()[orthogdim];
int fd =grid->_fdimensions[orthogdim];
int ld =grid->_ldimensions[orthogdim];
int rd =grid->_rdimensions[orthogdim];
int e1 =grid->_slice_nblock[orthogdim];
int e2 =grid->_slice_block [orthogdim];
int stride =grid->_slice_stride[orthogdim];
std::vector<int> icoor;
for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
vector_type av;
for(int l=0;l<Nsimd;l++){
grid->iCoorFromIindex(icoor,l);
int ldx =r+icoor[orthogdim]*rd;
scalar_type *as =(scalar_type *)&av;
as[l] = scalar_type(a[ldx])*zscale;
}
tensor_reduced at; at=av;
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
}
}
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0);
std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(0);
for(int d=0;d<NN;d++){
if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
}
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R[o+i*ostride]=dot;
}
}}
}
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R[o+i*ostride]=dot;
}
}}
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
#pragma omp parallel
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs[o+i*ostride];
Right[i] = rhs[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp);
}}
}}
#pragma omp critical
{
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}
} /*END NAMESPACE GRID*/
#endif

520
Grid/lattice/Lattice_rng.h Normal file
View File

@ -0,0 +1,520 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_rng.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_RNG_H
#define GRID_LATTICE_RNG_H
#include <random>
#ifdef RNG_SITMO
#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
#endif
#if defined(RNG_SITMO)
#define RNG_FAST_DISCARD
#else
#undef RNG_FAST_DISCARD
#endif
namespace Grid {
//////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid
//////////////////////////////////////////////////////////////
inline int RNGfillable(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension;
assert(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){
assert(fine->_simd_layout[d]==1);
assert(fine->_processors[d]==1);
}
int multiplicity=1;
for(int d=0;d<lowerdims;d++){
multiplicity=multiplicity*fine->_rdimensions[d];
}
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){
int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
return multiplicity;
}
// merge of April 11 2017
// this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors
// all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same
assert(fine->Nsimd() == coarse->Nsimd());
// check that the two grids divide cleanly
assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
return fine->lSites() / coarse->lSites();
}
// real scalars are one component
template<class scalar,class distribution,class generator>
void fillScalar(scalar &s,distribution &dist,generator & gen)
{
s=dist(gen);
}
template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{
s=ComplexF(dist(gen),dist(gen));
}
template<class distribution,class generator>
void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{
s=ComplexD(dist(gen),dist(gen));
}
class GridRNGbase {
public:
// One generator per site.
// Uniform and Gaussian distributions from these generators.
#ifdef RNG_RANLUX
typedef std::ranlux48 RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 15;
#endif
#ifdef RNG_MT19937
typedef std::mt19937 RngEngine;
typedef uint32_t RngStateType;
static const int RngStateCount = std::mt19937::state_size;
#endif
#ifdef RNG_SITMO
typedef sitmo::prng_engine RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 13;
#endif
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid;
///////////////////////
// support for parallel init
///////////////////////
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
// Consider quenched updating; likely never exceeding rate of 1000 sweeps
// per second on any machine. This gives us of order 10^9 seconds, or 100 years
// skip ahead.
// For HMC unlikely to go at faster than a solve per second, and
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
uint64_t skip = site;
skip = skip<<shift;
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
static RngEngine Reseed(RngEngine &eng)
{
std::vector<uint32_t> newseed;
std::uniform_int_distribution<uint32_t> uid;
return Reseed(eng,newseed,uid);
}
static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
std::uniform_int_distribution<uint32_t> &uid)
{
const int reseeds=4;
newseed.resize(reseeds);
for(int i=0;i<reseeds;i++){
newseed[i] = uid(eng);
}
std::seed_seq sseq(newseed.begin(),newseed.end());
return RngEngine(sseq);
}
void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
saved.resize(RngStateCount);
std::stringstream ss;
ss<<eng;
ss.seekg(0,ss.beg);
for(int i=0;i<RngStateCount;i++){
ss>>saved[i];
}
}
void GetState(std::vector<RngStateType> & saved,int gen) {
GetState(saved,_generators[gen]);
}
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
assert(saved.size()==RngStateCount);
std::stringstream ss;
for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" ";
}
ss.seekg(0,ss.beg);
ss>>eng;
}
void SetState(std::vector<RngStateType> & saved,int gen){
SetState(saved,_generators[gen]);
}
void SetEngine(RngEngine &Eng, int gen){
_generators[gen]=Eng;
}
void GetEngine(RngEngine &Eng, int gen){
Eng=_generators[gen];
}
template<class source> void Seed(source &src, int gen)
{
_generators[gen] = RngEngine(src);
}
};
class GridSerialRNG : public GridRNGbase {
public:
GridSerialRNG() : GridRNGbase() {
_generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() );
}
template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
typedef typename sobj::scalar_type scalar_type;
int words = sizeof(sobj)/sizeof(scalar_type);
scalar_type *buf = (scalar_type *) & l;
dist[0].reset();
for(int idx=0;idx<words;idx++){
fillScalar(buf[idx],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
};
template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(ComplexD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// vector fill
template <class distribution> inline void fill(vComplexF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vComplexD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<vRealF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<vRealD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
void SeedFixedIntegers(const std::vector<int> &seeds){
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq src(seeds.begin(),seeds.end());
Seed(src,0);
}
void SeedUniqueString(const std::string &s){
std::vector<int> seeds;
std::stringstream sha;
seeds = GridChecksum::sha256_seeds(s);
for(int i=0;i<seeds.size();i++) {
sha << std::hex << seeds[i];
}
std::cout << GridLogMessage << "Intialising serial RNG with unique string '"
<< s << "'" << std::endl;
std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
SeedFixedIntegers(seeds);
}
};
class GridParallelRNG : public GridRNGbase {
double _time_counter;
public:
GridBase *_grid;
unsigned int _vol;
int generator_idx(int os,int is) {
return is*_grid->oSites()+os;
}
GridParallelRNG(GridBase *grid) : GridRNGbase() {
_grid = grid;
_vol =_grid->iSites()*_grid->oSites();
_generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
double inner_time_counter = usecond();
int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l._grid too
int osites = _grid->oSites(); // guaranteed to be <= l._grid->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
parallel_for(int ss=0;ss<osites;ss++){
std::vector<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
for (int si = 0; si < Nsimd; si++) {
int gdx = generator_idx(ss, si); // index of generator state
scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset();
for (int idx = 0; idx < words; idx++)
fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
}
// merge into SIMD lanes, FIXME suboptimal implementation
merge(l._odata[sm], buf);
}
}
_time_counter += usecond()- inner_time_counter;
};
void SeedUniqueString(const std::string &s){
std::vector<int> seeds;
std::stringstream sha;
seeds = GridChecksum::sha256_seeds(s);
for(int i=0;i<seeds.size();i++) {
sha << std::hex << seeds[i];
}
std::cout << GridLogMessage << "Intialising parallel RNG with unique string '"
<< s << "'" << std::endl;
std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq source(seeds.begin(),seeds.end());
RngEngine master_engine(source);
#ifdef RNG_FAST_DISCARD
////////////////////////////////////////////////
// Skip ahead through a single stream.
// Applicable to SITMO and other has based/crypto RNGs
// Should be applicable to Mersenne Twister, but the C++11
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
// Everybody loops over global volume.
parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
// Where is it?
int rank,o_idx,i_idx;
std::vector<int> gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
}
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient
// and maximally parallel; but NOT reproducible from machine to machine.
// Not ideal, but fastest way to reseed all nodes.
////////////////////////////////////////////////////////////////
{
// Obtain one Reseed per processor
int Nproc = _grid->ProcessorCount();
std::vector<RngEngine> seeders(Nproc);
int me= _grid->ThisRank();
for(int p=0;p<Nproc;p++){
seeders[p] = Reseed(master_engine);
}
master_engine = seeders[me];
}
{
// Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads();
std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine);
}
parallel_for(int t=0;t<Nthread;t++) {
// set up one per local site in threaded fashion
std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid;
for(int l=0;l<_grid->lSites();l++) {
if ( (l%Nthread)==t ) {
_generators[l] = Reseed(seeders[t],newseeds,uid);
}
}
}
}
#endif
}
void Report(){
std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
}
////////////////////////////////////////////////////////////////////////
// Support for rigorous test of RNG's
// Return uniform random uint32_t from requested site generator
////////////////////////////////////////////////////////////////////////
uint32_t GlobalU01(int gsite){
uint32_t the_number;
// who
std::vector<int> gcoor;
int rank,o_idx,i_idx;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// draw
int l_idx=generator_idx(o_idx,i_idx);
if( rank == _grid->ThisRank() ){
the_number = _uid[l_idx](_generators[l_idx]);
}
// share & return
_grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
return the_number;
}
};
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
}
#endif

View File

@ -0,0 +1,67 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_trace.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_TRACE_H
#define GRID_LATTICE_TRACE_H
///////////////////////////////////////////////
// Tracing, transposing, peeking, poking
///////////////////////////////////////////////
namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto trace(const Lattice<vobj> &lhs)
-> Lattice<decltype(trace(lhs._odata[0]))>
{
Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = trace(lhs._odata[ss]);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
}
return ret;
};
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,63 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_transpose.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_TRANSPOSE_H
#define GRID_LATTICE_TRANSPOSE_H
///////////////////////////////////////////////
// Transpose
///////////////////////////////////////////////
namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = transpose(lhs._odata[ss]);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
{
Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
}
return ret;
};
}
#endif

View File

@ -0,0 +1,84 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_unary.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_UNARY_H
#define GRID_LATTICE_UNARY_H
namespace Grid {
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=pow(rhs._odata[ss],y);
}
return ret;
}
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs,Integer y){
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=mod(rhs._odata[ss],y);
}
return ret;
}
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=div(rhs._odata[ss],y);
}
return ret;
}
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
}
return ret;
}
}
#endif

View File

@ -0,0 +1,86 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_where.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_WHERE_H
#define GRID_LATTICE_WHERE_H
namespace Grid {
// Must implement the predicate gating the
// Must be able to reduce the predicate down to a single vInteger per site.
// Must be able to require the type be iScalar x iScalar x ....
// give a GetVtype method in iScalar
// and blow away the tensor structures.
//
template<class vobj,class iobj>
inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
conformable(iftrue,ret);
GridBase *grid=iftrue._grid;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename iobj::vector_type mask_type;
const int Nsimd = grid->Nsimd();
std::vector<Integer> mask(Nsimd);
std::vector<scalar_object> truevals (Nsimd);
std::vector<scalar_object> falsevals(Nsimd);
parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
extract(iftrue._odata[ss] ,truevals);
extract(iffalse._odata[ss] ,falsevals);
extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
for(int s=0;s<Nsimd;s++){
if (mask[s]) falsevals[s]=truevals[s];
}
merge(ret._odata[ss],falsevals);
}
}
template<class vobj,class iobj>
inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &iftrue,Lattice<vobj> &iffalse)
{
conformable(iftrue,iffalse);
conformable(iftrue,predicate);
Lattice<vobj> ret(iftrue._grid);
where(ret,predicate,iftrue,iffalse);
return ret;
}
}
#endif

116
Grid/log/Log.cc Normal file
View File

@ -0,0 +1,116 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Log.cc
Copyright (C) 2015
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/util/CompilerCompatible.h>
#include <cxxabi.h>
#include <memory>
namespace Grid {
std::string demangle(const char* name) {
int status = -4; // some arbitrary value to eliminate the compiler warning
// enable c++11 by passing the flag -std=c++11 to g++
std::unique_ptr<char, void(*)(void*)> res {
abi::__cxa_demangle(name, NULL, NULL, &status),
std::free
};
return (status==0) ? res.get() : name ;
}
GridStopWatch Logger::GlobalStopWatch;
int Logger::timestamp;
std::ostream Logger::devnull(0);
void GridLogTimestamp(int on){
Logger::Timestamp(on);
}
Colours GridLogColours(0);
GridLogger GridLogIRL (1, "IRL" , GridLogColours, "NORMAL");
GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0);
GridLogWarning.Active(0);
GridLogMessage.Active(1); // at least the messages should be always on
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogIntegrator.Active(0);
GridLogColours.Active(0);
for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance"))
GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
}
}
////////////////////////////////////////////////////////////
// Verbose limiter on MPI tasks
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) {
int me = 0;
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif
#ifdef GRID_COMMS_SHMEM
me = shmem_my_pe();
#endif
if (me) {
std::cout.setstate(std::ios::badbit);
}
}
void Grid_unquiesce_nodes(void) {
#ifdef GRID_COMMS_MPI
std::cout.clear();
#endif
}
}

216
Grid/log/Log.h Normal file
View File

@ -0,0 +1,216 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Log.h
Copyright (C) 2015
Author: Antonin Portelli <antonin.portelli@me.com>
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <map>
#ifndef GRID_LOG_H
#define GRID_LOG_H
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dress the output; use std::chrono for time stamping via the StopWatch class
//////////////////////////////////////////////////////////////////////////////////////////////////
class Colours{
protected:
bool is_active;
public:
std::map<std::string, std::string> colour;
Colours(bool activate=false){
Active(activate);
};
void Active(bool activate){
is_active=activate;
if (is_active){
colour["BLACK"] ="\033[30m";
colour["RED"] ="\033[31m";
colour["GREEN"] ="\033[32m";
colour["YELLOW"] ="\033[33m";
colour["BLUE"] ="\033[34m";
colour["PURPLE"] ="\033[35m";
colour["CYAN"] ="\033[36m";
colour["WHITE"] ="\033[37m";
colour["NORMAL"] ="\033[0;39m";
} else {
colour["BLACK"] ="";
colour["RED"] ="";
colour["GREEN"] ="";
colour["YELLOW"]="";
colour["BLUE"] ="";
colour["PURPLE"]="";
colour["CYAN"] ="";
colour["WHITE"] ="";
colour["NORMAL"]="";
}
};
};
class Logger {
protected:
Colours &Painter;
int active;
int timing_mode;
int topWidth{-1}, chanWidth{-1};
static int timestamp;
std::string name, topName;
std::string COLOUR;
public:
static GridStopWatch GlobalStopWatch;
GridStopWatch LocalStopWatch;
GridStopWatch *StopWatch;
static std::ostream devnull;
std::string background() {return Painter.colour["NORMAL"];}
std::string evidence() {return Painter.colour["YELLOW"];}
std::string colour() {return Painter.colour[COLOUR];}
Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on),
name(nm),
topName(topNm),
Painter(col_class),
timing_mode(0),
COLOUR(col)
{
StopWatch = & GlobalStopWatch;
};
void Active(int on) {active = on;};
int isActive(void) {return active;};
static void Timestamp(int on) {timestamp = on;};
void Reset(void) {
StopWatch->Reset();
StopWatch->Start();
}
void TimingMode(int on) {
timing_mode = on;
if(on) {
StopWatch = &LocalStopWatch;
Reset();
}
}
void setTopWidth(const int w) {topWidth = w;}
void setChanWidth(const int w) {chanWidth = w;}
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
stream << log.background()<< std::left;
if (log.topWidth > 0)
{
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
stream << log.colour() << std::left;
if (log.chanWidth > 0)
{
stream << std::setw(log.chanWidth);
}
stream << log.name << log.background() << " : ";
if ( log.timestamp ) {
log.StopWatch->Stop();
GridTime now = log.StopWatch->Elapsed();
if ( log.timing_mode==1 ) log.StopWatch->Reset();
log.StopWatch->Start();
stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
}
stream << log.colour();
return stream;
} else {
return devnull;
}
}
};
class GridLogger: public Logger {
public:
GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
Logger("Grid", on, nm, col_class, col_key){};
};
void GridLogConfigure(std::vector<std::string> &logstreams);
extern GridLogger GridLogIRL;
extern GridLogger GridLogSolver;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern Colours GridLogColours;
std::string demangle(const char* name) ;
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() {\
char string[20]; \
std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
std::FILE * fp = std::fopen(string,"w"); \
BACKTRACEFP(fp)\
std::fclose(fp); \
}
#ifdef HAVE_EXECINFO_H
#define BACKTRACEFP(fp) { \
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
for (int i = 0; i < symbols; i++){\
std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \
}\
}
#else
#define BACKTRACEFP(fp) { \
std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
}
#endif
#define BACKTRACE() BACKTRACEFP(stdout)
}
#endif

729
Grid/parallelIO/BinaryIO.h Normal file
View File

@ -0,0 +1,729 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/BinaryIO.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu<guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO
#else
#undef USE_MPI_IO
#endif
#ifdef HAVE_ENDIAN_H
#include <endian.h>
#endif
#include <arpa/inet.h>
#include <algorithm>
namespace Grid {
/////////////////////////////////////////////////////////////////////////////////
// Byte reversal garbage
/////////////////////////////////////////////////////////////////////////////////
inline uint32_t byte_reverse32(uint32_t f) {
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
return f;
}
inline uint64_t byte_reverse64(uint64_t f) {
uint64_t g;
g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
g = g << 32;
f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
return g;
}
#if BYTE_ORDER == BIG_ENDIAN
inline uint64_t Grid_ntohll(uint64_t A) { return A; }
#else
inline uint64_t Grid_ntohll(uint64_t A) {
return byte_reverse64(A);
}
#endif
// A little helper
inline void removeWhitespace(std::string &key)
{
key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end());
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Static class holding the parallel IO code
// Could just use a namespace
///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO {
public:
/////////////////////////////////////////////////////////////////////////////
// more byte manipulation helpers
/////////////////////////////////////////////////////////////////////////////
template<class vobj> static inline void Uint32Checksum(Lattice<vobj> &lat,uint32_t &nersc_csum)
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid;
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
unvectorizeToLexOrdArray(scalardata,lat);
NerscChecksum(grid,scalardata,nersc_csum);
}
template <class fobj>
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
{
const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
uint64_t lsites = grid->lSites();
if (fbuf.size() == 1)
{
lsites = 1;
}
PARALLEL_REGION
{
uint32_t nersc_csum_thr = 0;
PARALLEL_FOR_LOOP_INTERN
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++)
{
nersc_csum_thr = nersc_csum_thr + site_buf[j];
}
}
PARALLEL_CRITICAL
{
nersc_csum += nersc_csum_thr;
}
}
}
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
int nd = grid->_ndimension;
uint64_t lsites =grid->lSites();
if (fbuf.size()==1) {
lsites=1;
}
std::vector<int> local_vol =grid->LocalDimensions();
std::vector<int> local_start =grid->LocalStarts();
std::vector<int> global_vol =grid->FullDimensions();
PARALLEL_REGION
{
std::vector<int> coor(nd);
uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0;
PARALLEL_FOR_LOOP_INTERN
for(uint64_t local_site=0;local_site<lsites;local_site++){
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
/*
* Scidac csum is rather more heavyweight
* FIXME -- 128^3 x 256 x 16 will overflow.
*/
int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol);
for(int d=0;d<nd;d++) {
coor[d] = coor[d]+local_start[d];
}
Lexicographic::IndexFromCoor(coor,global_site,global_vol);
uint32_t gsite29 = global_site%29;
uint32_t gsite31 = global_site%31;
site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
// std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
// std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
}
PARALLEL_CRITICAL
{
scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr;
}
}
}
// Network is big endian
static inline void htobe32_v(void *file_object,uint32_t bytes){ be32toh_v(file_object,bytes);}
static inline void htobe64_v(void *file_object,uint32_t bytes){ be64toh_v(file_object,bytes);}
static inline void htole32_v(void *file_object,uint32_t bytes){ le32toh_v(file_object,bytes);}
static inline void htole64_v(void *file_object,uint32_t bytes){ le64toh_v(file_object,bytes);}
static inline void be32toh_v(void *file_object,uint64_t bytes)
{
uint32_t * f = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){
f[i] = ntohl(f[i]);
}
}
// LE must Swap and switch to host
static inline void le32toh_v(void *file_object,uint64_t bytes)
{
uint32_t *fp = (uint32_t *)file_object;
uint32_t f;
uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){
f = fp[i];
// got network order and the network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f);
}
}
// BE is same as network
static inline void be64toh_v(void *file_object,uint64_t bytes)
{
uint64_t * f = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
f[i] = Grid_ntohll(f[i]);
}
}
// LE must swap and switch;
static inline void le64toh_v(void *file_object,uint64_t bytes)
{
uint64_t *fp = (uint64_t *)file_object;
uint64_t f,g;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
f = fp[i];
// got network order and the network to host
g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
g = g << 32;
f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g);
}
}
/////////////////////////////////////////////////////////////////////////////
// Real action:
// Read or Write distributed lexico array of ANY object to a specific location in file
//////////////////////////////////////////////////////////////////////////////////////
static const int BINARYIO_MASTER_APPEND = 0x10;
static const int BINARYIO_UNORDERED = 0x08;
static const int BINARYIO_LEXICOGRAPHIC = 0x04;
static const int BINARYIO_READ = 0x02;
static const int BINARYIO_WRITE = 0x01;
template<class word,class fobj>
static inline void IOobject(word w,
GridBase *grid,
std::vector<fobj> &iodata,
std::string file,
uint64_t& offset,
const std::string &format, int control,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
grid->Barrier();
GridStopWatch timer;
GridStopWatch bstimer;
nersc_csum=0;
scidac_csuma=0;
scidac_csumb=0;
int ndim = grid->Dimensions();
int nrank = grid->ProcessorCount();
int myrank = grid->ThisRank();
std::vector<int> psizes = grid->ProcessorGrid();
std::vector<int> pcoor = grid->ThisProcessorCoor();
std::vector<int> gLattice= grid->GlobalDimensions();
std::vector<int> lLattice= grid->LocalDimensions();
std::vector<int> lStart(ndim);
std::vector<int> gStart(ndim);
// Flatten the file
uint64_t lsites = grid->lSites();
if ( control & BINARYIO_MASTER_APPEND ) {
assert(iodata.size()==1);
} else {
assert(lsites==iodata.size());
}
for(int d=0;d<ndim;d++){
gStart[d] = lLattice[d]*pcoor[d];
lStart[d] = 0;
}
#ifdef USE_MPI_IO
std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
std::vector<int> dargs (ndim,MPI_DISTRIBUTE_DFLT_DARG);
MPI_Datatype mpiObject;
MPI_Datatype fileArray;
MPI_Datatype localArray;
MPI_Datatype mpiword;
MPI_Offset disp = offset;
MPI_File fh ;
MPI_Status status;
int numword;
if ( sizeof( word ) == sizeof(float ) ) {
numword = sizeof(fobj)/sizeof(float);
mpiword = MPI_FLOAT;
} else {
numword = sizeof(fobj)/sizeof(double);
mpiword = MPI_DOUBLE;
}
//////////////////////////////////////////////////////////////////////////////
// Sobj in MPI phrasing
//////////////////////////////////////////////////////////////////////////////
int ierr;
ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject); assert(ierr==0);
ierr = MPI_Type_commit(&mpiObject);
//////////////////////////////////////////////////////////////////////////////
// File global array data type
//////////////////////////////////////////////////////////////////////////////
ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray); assert(ierr==0);
ierr=MPI_Type_commit(&fileArray); assert(ierr==0);
//////////////////////////////////////////////////////////////////////////////
// local lattice array
//////////////////////////////////////////////////////////////////////////////
ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); assert(ierr==0);
ierr=MPI_Type_commit(&localArray); assert(ierr==0);
#endif
//////////////////////////////////////////////////////////////////////////////
// Byte order
//////////////////////////////////////////////////////////////////////////////
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
//////////////////////////////////////////////////////////////////////////////
// Do the I/O
//////////////////////////////////////////////////////////////////////////////
if ( control & BINARYIO_READ ) {
timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
#else
assert(0);
#endif
} else {
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND)
{
fin.seekg(-sizeof(fobj), fin.end);
}
else
{
fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
assert(fin.fail() == 0);
fin.close();
}
timer.Stop();
grid->Barrier();
bstimer.Start();
ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee32) le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee64) le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
NerscChecksum(grid,iodata,nersc_csum);
bstimer.Stop();
}
if ( control & BINARYIO_WRITE ) {
bstimer.Start();
NerscChecksum(grid,iodata,nersc_csum);
if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee32) htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee64) htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
bstimer.Stop();
grid->Barrier();
timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
// std::cout << GridLogMessage << "Checking for errors" << std::endl;
if (ierr != MPI_SUCCESS)
{
char error_string[BUFSIZ];
int length_of_error_string, error_class;
MPI_Error_class(ierr, &error_class);
MPI_Error_string(error_class, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Error_string(ierr, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_Offset os;
MPI_File_get_position(fh, &os);
MPI_File_get_byte_offset(fh, os, &disp);
offset = disp;
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
#else
assert(0);
#endif
} else {
std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
std::ofstream fout;
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
try {
if (offset) { // Must already exist and contain data
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} else { // Allow create
fout.open(file,std::ios::binary|std::ios::out);
}
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
// std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
if ( control & BINARYIO_MASTER_APPEND ) {
try {
fout.seekp(0,fout.end);
} catch (const std::fstream::failure& exc) {
std::cout << "Exception in seeking file end " << file << std::endl;
}
} else {
try {
fout.seekp(offset+myrank*lsites*sizeof(fobj));
} catch (const std::fstream::failure& exc) {
std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl;
}
}
try {
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
}
catch (const std::fstream::failure& exc) {
std::cout << "Exception in writing file " << file << std::endl;
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
offset = fout.tellp();
fout.close();
}
timer.Stop();
}
std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read ";
else std::cout << " write ";
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
//////////////////////////////////////////////////////////////////////////////
// Safety check
//////////////////////////////////////////////////////////////////////////////
// if the data size is 1 we do not want to sum over the MPI ranks
if (iodata.size() != 1){
grid->Barrier();
grid->GlobalSum(nersc_csum);
grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
}
}
/////////////////////////////////////////////////////////////////////////////
// Read a Lattice of object
//////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class fobj,class munger>
static inline void readLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
GridStopWatch timer;
timer.Start();
parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier();
timer.Stop();
std::cout<<GridLogMessage<<"readLatticeObject: vectorize overhead "<<timer.Elapsed() <<std::endl;
}
/////////////////////////////////////////////////////////////////////////////
// Write a Lattice of object
//////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class fobj,class munger>
static inline void writeLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
//////////////////////////////////////////////////////////////////////////////
// Munge [ .e.g 3rd row recon ]
//////////////////////////////////////////////////////////////////////////////
GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
grid->Barrier();
timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
}
/////////////////////////////////////////////////////////////////////////////
// Read a RNG; use IOobject and lexico map to an array of state
//////////////////////////////////////////////////////////////////////////////////////
static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
typedef typename GridSerialRNG::RngStateType RngStateType;
const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate;
typedef RngStateType word; word w=0;
std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid;
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0;
uint32_t scidac_csumb_tmp = 0;
GridStopWatch timer;
std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
std::vector<RNGstate> iodata(lsites);
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
timer.Start();
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx);
}
timer.Stop();
iodata.resize(1);
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND,
nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
{
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
serial.SetState(tmp,0);
}
nersc_csum = nersc_csum + nersc_csum_tmp;
scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
std::cout << GridLogMessage << "RNG file nersc_checksum " << std::hex << nersc_csum << std::dec << std::endl;
std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
}
/////////////////////////////////////////////////////////////////////////////
// Write a RNG; lexico map to an array of state and use IOobject
//////////////////////////////////////////////////////////////////////////////////////
static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
typedef typename GridSerialRNG::RngStateType RngStateType;
typedef RngStateType word; word w=0;
const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid;
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
uint32_t scidac_csumb_tmp;
GridStopWatch timer;
std::string format = "IEEE32BIG";
std::cout << GridLogMessage << "RNG write I/O on file " << file << std::endl;
timer.Start();
std::vector<RNGstate> iodata(lsites);
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}
timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
iodata.resize(1);
{
std::vector<RngStateType> tmp(RngStateCount);
serial.GetState(tmp,0);
std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
}
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
nersc_csum = nersc_csum + nersc_csum_tmp;
scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
std::cout << GridLogMessage << "RNG file checksum " << std::hex << nersc_csum << std::dec << std::endl;
std::cout << GridLogMessage << "RNG file checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "RNG file checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
}
};
}
#endif

868
Grid/parallelIO/IldgIO.h Normal file
View File

@ -0,0 +1,868 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/IldgIO.h
Copyright (C) 2015
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ILDG_IO_H
#define GRID_ILDG_IO_H
#ifdef HAVE_LIME
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <pwd.h>
#include <sys/utsname.h>
#include <unistd.h>
//C-Lime is a must have for this functionality
extern "C" {
#include "lime.h"
}
namespace Grid {
namespace QCD {
/////////////////////////////////
// Encode word types as strings
/////////////////////////////////
template<class word> inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); }
template<> inline std::string ScidacWordMnemonic<double> (void){ return std::string("D"); }
template<> inline std::string ScidacWordMnemonic<float> (void){ return std::string("F"); }
template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); }
template<> inline std::string ScidacWordMnemonic<uint32_t>(void){ return std::string("U32_t"); }
template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); }
template<> inline std::string ScidacWordMnemonic<uint64_t>(void){ return std::string("U64_t"); }
/////////////////////////////////////////
// Encode a generic tensor as a string
/////////////////////////////////////////
template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) {
typedef typename getPrecision<vobj>::real_scalar_type stype;
int _ColourN = indexRank<ColourIndex,vobj>();
int _ColourScalar = isScalar<ColourIndex,vobj>();
int _ColourVector = isVector<ColourIndex,vobj>();
int _ColourMatrix = isMatrix<ColourIndex,vobj>();
int _SpinN = indexRank<SpinIndex,vobj>();
int _SpinScalar = isScalar<SpinIndex,vobj>();
int _SpinVector = isVector<SpinIndex,vobj>();
int _SpinMatrix = isMatrix<SpinIndex,vobj>();
int _LorentzN = indexRank<LorentzIndex,vobj>();
int _LorentzScalar = isScalar<LorentzIndex,vobj>();
int _LorentzVector = isVector<LorentzIndex,vobj>();
int _LorentzMatrix = isMatrix<LorentzIndex,vobj>();
std::stringstream stream;
stream << "GRID_";
stream << ScidacWordMnemonic<stype>();
if ( _LorentzVector ) stream << "_LorentzVector"<<_LorentzN;
if ( _LorentzMatrix ) stream << "_LorentzMatrix"<<_LorentzN;
if ( _SpinVector ) stream << "_SpinVector"<<_SpinN;
if ( _SpinMatrix ) stream << "_SpinMatrix"<<_SpinN;
if ( _ColourVector ) stream << "_ColourVector"<<_ColourN;
if ( _ColourMatrix ) stream << "_ColourMatrix"<<_ColourN;
if ( _ColourScalar && _LorentzScalar && _SpinScalar ) stream << "_Complex";
typesize = sizeof(typename vobj::scalar_type);
if ( _ColourMatrix ) typesize*= _ColourN*_ColourN;
else typesize*= _ColourN;
if ( _SpinMatrix ) typesize*= _SpinN*_SpinN;
else typesize*= _SpinN;
colors = _ColourN;
spins = _SpinN;
datacount = _LorentzN;
return stream.str();
}
template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) {
return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
};
////////////////////////////////////////////////////////////
// Helper to fill out metadata
////////////////////////////////////////////////////////////
template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
FieldMetaData &header,
scidacRecord & _scidacRecord,
scidacFile & _scidacFile)
{
typedef typename getPrecision<vobj>::real_scalar_type stype;
/////////////////////////////////////
// Pull Grid's metadata
/////////////////////////////////////
PrepareMetaData(field,header);
/////////////////////////////////////
// Scidac Private File structure
/////////////////////////////////////
_scidacFile = scidacFile(field._grid);
/////////////////////////////////////
// Scidac Private Record structure
/////////////////////////////////////
scidacRecord sr;
sr.datatype = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount);
sr.date = header.creation_date;
sr.precision = ScidacWordMnemonic<stype>();
sr.recordtype = GRID_IO_FIELD;
_scidacRecord = sr;
// std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
}
///////////////////////////////////////////////////////
// Scidac checksum
///////////////////////////////////////////////////////
static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
{
uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
if ( scidac_csuma !=scidac_checksuma) return 0;
if ( scidac_csumb !=scidac_checksumb) return 0;
return 1;
}
////////////////////////////////////////////////////////////////////////////////////
// Lime, ILDG and Scidac I/O classes
////////////////////////////////////////////////////////////////////////////////////
class GridLimeReader : public BinaryIO {
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
///////////////////////////////////////////////////
FILE *File;
LimeReader *LimeR;
std::string filename;
/////////////////////////////////////////////
// Open the file
/////////////////////////////////////////////
void open(const std::string &_filename)
{
filename= _filename;
File = fopen(filename.c_str(), "r");
if (File == nullptr)
{
std::cerr << "cannot open file '" << filename << "'" << std::endl;
abort();
}
LimeR = limeCreateReader(File);
}
/////////////////////////////////////////////
// Close the file
/////////////////////////////////////////////
void close(void){
fclose(File);
// limeDestroyReader(LimeR);
}
////////////////////////////////////////////
// Read a generic lattice field and verify checksum
////////////////////////////////////////////
template<class vobj>
void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
typedef typename vobj::scalar_object sobj;
scidacChecksum scidacChecksum_;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
std::string format = getFormatString<vobj>();
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
uint64_t file_bytes =limeReaderBytes(LimeR);
// std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl;
assert(PayloadSize == file_bytes);// Must match or user error
uint64_t offset= ftello(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
/////////////////////////////////////////////
// Insist checksum is next record
/////////////////////////////////////////////
readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
/////////////////////////////////////////////
// Verify checksums
/////////////////////////////////////////////
assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
return;
}
}
}
////////////////////////////////////////////
// Read a generic serialisable object
////////////////////////////////////////////
template<class serialisable_object>
void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
{
// should this be a do while; can we miss a first record??
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
// std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
std::string xmlstring(&xmlc[0]);
XmlReader RD(xmlstring, true, "");
read(RD,object_name,object);
return;
}
}
assert(0);
}
};
class GridLimeWriter : public BinaryIO
{
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
// FIXME: collective calls or not ?
// : must know if I am the I/O boss
///////////////////////////////////////////////////
FILE *File;
LimeWriter *LimeW;
std::string filename;
bool boss_node;
GridLimeWriter( bool isboss = true) {
boss_node = isboss;
}
void open(const std::string &_filename) {
filename= _filename;
if ( boss_node ) {
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
}
}
/////////////////////////////////////////////
// Close the file
/////////////////////////////////////////////
void close(void) {
if ( boss_node ) {
fclose(File);
}
// limeDestroyWriter(LimeW);
}
///////////////////////////////////////////////////////
// Lime utility functions
///////////////////////////////////////////////////////
int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
{
if ( boss_node ) {
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
assert(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
}
return LIME_SUCCESS;
}
////////////////////////////////////////////
// Write a generic serialisable object
////////////////////////////////////////////
void writeLimeObject(int MB,int ME,XmlWriter &writer,std::string object_name,std::string record_name)
{
if ( boss_node ) {
std::string xmlstring = writer.docString();
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
}
}
template<class serialisable_object>
void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, const unsigned int scientificPrec = 0)
{
XmlWriter WR("","");
if (scientificPrec)
{
WR.scientificFormat(true);
WR.setPrecision(scientificPrec);
}
write(WR,object_name,object);
writeLimeObject(MB, ME, WR, object_name, record_name);
}
////////////////////////////////////////////////////
// Write a generic lattice field and csum
// This routine is Collectively called by all nodes
// in communicator used by the field._grid
////////////////////////////////////////////////////
template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
// the same file through different file handles (integer units).
//
// These are both buffered, so why I think this code is right is as follows.
//
// i) write record header to FILE *File, telegraphing the size; flush
// ii) ftello reads the offset from FILE *File .
// iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
// Closes iostream and flushes.
// iv) fseek on FILE * to end of this disjoint section.
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
GridBase *grid = field._grid;
assert(boss_node == field._grid->IsBoss() );
////////////////////////////////////////////
// Create record header
////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
int err;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
if ( boss_node ) {
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
fflush(File);
}
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////
// Check all nodes agree on file position
////////////////////////////////////////////////
uint64_t offset1;
if ( boss_node ) {
offset1 = ftello(File);
}
grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
///////////////////////////////////////////
// The above is collective. Write by other means into the binary record
///////////////////////////////////////////
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
///////////////////////////////////////////
// Wind forward and close the record
///////////////////////////////////////////
if ( boss_node ) {
fseek(File,0,SEEK_END);
uint64_t offset2 = ftello(File); // std::cout << " now at offset "<<offset2 << std::endl;
assert( (offset2-offset1) == PayloadSize);
}
/////////////////////////////////////////////////////////////
// Check MPI-2 I/O did what we expect to file
/////////////////////////////////////////////////////////////
if ( boss_node ) {
err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////
// Write checksum element, propagaing forward from the BinaryIO
// Always pair a checksum with a binary object, and close message
////////////////////////////////////////
scidacChecksum checksum;
std::stringstream streama; streama << std::hex << scidac_csuma;
std::stringstream streamb; streamb << std::hex << scidac_csumb;
checksum.suma= streama.str();
checksum.sumb= streamb.str();
if ( boss_node ) {
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
}
}
};
class ScidacWriter : public GridLimeWriter {
public:
ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss) { };
template<class SerialisableUserFile>
void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
if ( this->boss_node ) {
writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0)
{
GridBase * grid = field._grid;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
ScidacMetaData(field,header,_scidacRecord,_scidacFile);
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
if ( this->boss_node ) {
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML), recordScientificPrec);
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
}
// Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
}
};
class ScidacReader : public GridLimeReader {
public:
template<class SerialisableUserFile>
void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{
typedef typename vobj::scalar_object sobj;
GridBase * grid = field._grid;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
}
void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return;
}
}
}
void skipPastObjectRecord(std::string rec_name) {
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
return;
}
}
}
void skipScidacFieldRecord() {
skipPastObjectRecord(std::string(GRID_FORMAT));
skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
skipPastBinaryRecord();
}
};
class IldgWriter : public ScidacWriter {
public:
IldgWriter(bool isboss) : ScidacWriter(isboss) {};
///////////////////////////////////
// A little helper
///////////////////////////////////
void writeLimeIldgLFN(std::string &LFN)
{
uint64_t PayloadSize = LFN.size();
int err;
createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////////////////////////////
// Special ILDG operations ; gauge configs only.
// Don't require scidac records EXCEPT checksum
// Use Grid MetaData object if present.
////////////////////////////////////////////////////////////////
template <class vsimd>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
{
GridBase * grid = Umu._grid;
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
std::string format = header.floating_point;
header.ensemble_id = description;
header.ensemble_label = description;
header.sequence_number = sequence;
header.ildg_lfn = LFN;
assert ( (format == std::string("IEEE32BIG"))
||(format == std::string("IEEE64BIG")) );
//////////////////////////////////////////////////////
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
ildgfmt.field = std::string("su3gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
} else {
ildgfmt.precision = 64;
}
ildgfmt.version = 1.0;
ildgfmt.lx = header.dimension[0];
ildgfmt.ly = header.dimension[1];
ildgfmt.lz = header.dimension[2];
ildgfmt.lt = header.dimension[3];
assert(header.nd==4);
assert(header.nd==header.dimension.size());
//////////////////////////////////////////////////////////////////////////////
// Fill the USQCD info field
//////////////////////////////////////////////////////////////////////////////
usqcdInfo info;
info.version=1.0;
info.plaq = header.plaquette;
info.linktr = header.link_trace;
std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
writeLimeObject(0,0,ildgfmt,std::string("ildgFormat") ,std::string(ILDG_FORMAT)); // rec
writeLimeIldgLFN(header.ildg_lfn); // rec
writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
// limeDestroyWriter(LimeW);
}
};
class IldgReader : public GridLimeReader {
public:
////////////////////////////////////////////////////////////////
// Read either Grid/SciDAC/ILDG configuration
// Don't require scidac records EXCEPT checksum
// Use Grid MetaData object if present.
// Else use ILDG MetaData object if present.
// Else use SciDAC MetaData object if present.
////////////////////////////////////////////////////////////////
template <class vsimd>
void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef typename GaugeField::vector_object vobj;
typedef typename vobj::scalar_object sobj;
typedef LorentzColourMatrixF fobj;
typedef LorentzColourMatrixD dobj;
GridBase *grid = Umu._grid;
std::vector<int> dims = Umu._grid->FullDimensions();
assert(dims.size()==4);
// Metadata holders
ildgFormat ildgFormat_ ;
std::string ildgLFN_ ;
scidacChecksum scidacChecksum_;
usqcdInfo usqcdInfo_ ;
// track what we read from file
int found_ildgFormat =0;
int found_ildgLFN =0;
int found_scidacChecksum=0;
int found_usqcdInfo =0;
int found_ildgBinary =0;
int found_FieldMetaData =0;
uint32_t nersc_csum;
uint32_t scidac_csuma;
uint32_t scidac_csumb;
// Binary format
std::string format;
//////////////////////////////////////////////////////////////////////////
// Loop over all records
// -- Order is poorly guaranteed except ILDG header preceeds binary section.
// -- Run like an event loop.
// -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing
// that Scidac.
// -- Insist on Scidac checksum record.
//////////////////////////////////////////////////////////////////////////
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
//////////////////////////////////////////////////////////////////
// If not BINARY_DATA read a string and parse
//////////////////////////////////////////////////////////////////
if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) ) ) {
// Copy out the string
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
// std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
//////////////////////////////////
// ILDG format record
std::string xmlstring(&xmlc[0]);
if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(xmlstring, true, "");
read(RD,"ildgFormat",ildgFormat_);
if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
assert( ildgFormat_.lx == dims[0]);
assert( ildgFormat_.ly == dims[1]);
assert( ildgFormat_.lz == dims[2]);
assert( ildgFormat_.lt == dims[3]);
found_ildgFormat = 1;
}
if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
FieldMetaData_.ildg_lfn = xmlstring;
found_ildgLFN = 1;
}
if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(xmlstring, true, "");
read(RD,"FieldMetaData",FieldMetaData_);
format = FieldMetaData_.floating_point;
assert(FieldMetaData_.dimension[0] == dims[0]);
assert(FieldMetaData_.dimension[1] == dims[1]);
assert(FieldMetaData_.dimension[2] == dims[2]);
assert(FieldMetaData_.dimension[3] == dims[3]);
found_FieldMetaData = 1;
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {
// is it a USQCD info field
if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) {
// std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
XmlReader RD(xmlstring, true, "");
read(RD,"usqcdInfo",usqcdInfo_);
found_usqcdInfo = 1;
}
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) {
XmlReader RD(xmlstring, true, "");
read(RD,"scidacChecksum",scidacChecksum_);
found_scidacChecksum = 1;
}
} else {
/////////////////////////////////
// Binary data
/////////////////////////////////
std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl;
uint64_t offset= ftello(File);
if ( format == std::string("IEEE64BIG") ) {
GaugeSimpleMunger<dobj, sobj> munge;
BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleMunger<fobj, sobj> munge;
BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
}
found_ildgBinary = 1;
}
}
//////////////////////////////////////////////////////
// Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format
//////////////////////////////////////////////////////
assert(found_ildgBinary);
assert(found_ildgFormat);
assert(found_scidacChecksum);
// Must find something with the lattice dimensions
assert(found_FieldMetaData||found_ildgFormat);
if ( found_FieldMetaData ) {
std::cout << GridLogMessage<<"Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
} else {
assert(found_ildgFormat);
assert ( ildgFormat_.field == std::string("su3gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
///////////////////////////////////////////////////////////////////////////////////////
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);
FieldMetaData_.dimension[0] = ildgFormat_.lx ;
FieldMetaData_.dimension[1] = ildgFormat_.ly ;
FieldMetaData_.dimension[2] = ildgFormat_.lz ;
FieldMetaData_.dimension[3] = ildgFormat_.lt ;
if ( found_usqcdInfo ) {
FieldMetaData_.plaquette = usqcdInfo_.plaq;
FieldMetaData_.link_trace= usqcdInfo_.linktr;
std::cout << GridLogMessage <<"This configuration was probably written by USQCD "<<std::endl;
std::cout << GridLogMessage <<"USQCD xml record Plaquette : "<<FieldMetaData_.plaquette<<std::endl;
std::cout << GridLogMessage <<"USQCD xml record LinkTrace : "<<FieldMetaData_.link_trace<<std::endl;
} else {
FieldMetaData_.plaquette = 0.0;
FieldMetaData_.link_trace= 0.0;
std::cout << GridLogWarning << "This configuration is unsafe with no plaquette records that can verify it !!! "<<std::endl;
}
}
////////////////////////////////////////////////////////////
// Really really want to mandate a scidac checksum
////////////////////////////////////////////////////////////
if ( found_scidacChecksum ) {
FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
} else {
std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
assert(0); // Can I insist always checksum ?
}
if ( found_FieldMetaData || found_usqcdInfo ) {
FieldMetaData checker;
GaugeStatistics(Umu,checker);
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
}
}
};
}}
//HAVE_LIME
#endif
#endif

View File

@ -0,0 +1,237 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/IldgIO.h
Copyright (C) 2015
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ILDGTYPES_IO_H
#define GRID_ILDGTYPES_IO_H
#ifdef HAVE_LIME
extern "C" { // for linkage
#include "lime.h"
}
namespace Grid {
/////////////////////////////////////////////////////////////////////////////////
// Data representation of records that enter ILDG and SciDac formats
/////////////////////////////////////////////////////////////////////////////////
#define GRID_FORMAT "grid-format"
#define ILDG_FORMAT "ildg-format"
#define ILDG_BINARY_DATA "ildg-binary-data"
#define ILDG_DATA_LFN "ildg-data-lfn"
#define SCIDAC_CHECKSUM "scidac-checksum"
#define SCIDAC_PRIVATE_FILE_XML "scidac-private-file-xml"
#define SCIDAC_FILE_XML "scidac-file-xml"
#define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
#define SCIDAC_RECORD_XML "scidac-record-xml"
#define SCIDAC_BINARY_DATA "scidac-binary-data"
// Unused SCIDAC records names; could move to support this functionality
#define SCIDAC_SITELIST "scidac-sitelist"
////////////////////////////////////////////////////////////
const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
const int GRID_IO_MULTIFILE = 1; // hardcode lift from QIO compat
const int GRID_IO_FIELD = 0; // hardcode lift from QIO compat
const int GRID_IO_GLOBAL = 1; // hardcode lift from QIO compat
////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////
// QIO uses mandatory "private" records fixed format
// Private is in principle "opaque" however it can't be changed now because that would break existing
// file compatability, so should be correct to assume the undocumented but defacto file structure.
/////////////////////////////////////////////////////////////////////////////////
struct emptyUserRecord : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
emptyUserRecord() { dummy=0; };
};
////////////////////////
// Scidac private file xml
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
////////////////////////
struct scidacFile : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
double, version,
int, spacetime,
std::string, dims, // must convert to int
int, volfmt);
std::vector<int> getDimensions(void) {
std::stringstream stream(dims);
std::vector<int> dimensions;
int n;
while(stream >> n){
dimensions.push_back(n);
}
return dimensions;
}
void setDimensions(std::vector<int> dimensions) {
char delimiter = ' ';
std::stringstream stream;
for(int i=0;i<dimensions.size();i++){
stream << dimensions[i];
if ( i != dimensions.size()-1) {
stream << delimiter <<std::endl;
}
}
dims = stream.str();
}
// Constructor provides Grid
scidacFile() =default; // default constructor
scidacFile(GridBase * grid){
version = 1.0;
spacetime = grid->_ndimension;
setDimensions(grid->FullDimensions());
volfmt = GRID_IO_SINGLEFILE;
}
};
///////////////////////////////////////////////////////////////////////
// scidac-private-record-xml : example
// <scidacRecord>
// <version>1.1</version><date>Tue Jul 26 21:14:44 2011 UTC</date><recordtype>0</recordtype>
// <datatype>QDP_D3_ColorMatrix</datatype><precision>D</precision><colors>3</colors><spins>4</spins>
// <typesize>144</typesize><datacount>4</datacount>
// </scidacRecord>
///////////////////////////////////////////////////////////////////////
struct scidacRecord : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
double, version,
std::string, date,
int, recordtype,
std::string, datatype,
std::string, precision,
int, colors,
int, spins,
int, typesize,
int, datacount);
scidacRecord()
: version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
{}
};
////////////////////////
// ILDG format
////////////////////////
struct ildgFormat : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
double, version,
std::string, field,
int, precision,
int, lx,
int, ly,
int, lz,
int, lt);
ildgFormat() { version=1.0; };
};
////////////////////////
// USQCD info
////////////////////////
struct usqcdInfo : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
double, version,
double, plaq,
double, linktr,
std::string, info);
usqcdInfo() {
version=1.0;
};
};
////////////////////////
// Scidac Checksum
////////////////////////
struct scidacChecksum : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
double, version,
std::string, suma,
std::string, sumb);
scidacChecksum() {
version=1.0;
};
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Type: scidac-file-xml <title>MILC ILDG archival gauge configuration</title>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Type:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////
// Scidac private file xml
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
////////////////////////
#if 0
////////////////////////////////////////////////////////////////////////////////////////
// From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
////////////////////////////////////////////////////////////////////////////////////////
struct usqcdPropFile : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
double, version,
std::string, type,
std::string, info);
usqcdPropFile() {
version=1.0;
};
};
struct usqcdSourceInfo : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
double, version,
std::string, info);
usqcdSourceInfo() {
version=1.0;
};
};
struct usqcdPropInfo : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
double, version,
int, spin,
int, color,
std::string, info);
usqcdPropInfo() {
version=1.0;
};
};
#endif
}
#endif
#endif

327
Grid/parallelIO/MetaData.h Normal file
View File

@ -0,0 +1,327 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/NerscIO.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <map>
#include <unistd.h>
#include <sys/utsname.h>
#include <pwd.h>
namespace Grid {
///////////////////////////////////////////////////////
// Precision mapping
///////////////////////////////////////////////////////
template<class vobj> static std::string getFormatString (void)
{
std::string format;
typedef typename getPrecision<vobj>::real_scalar_type stype;
if ( sizeof(stype) == sizeof(float) ) {
format = std::string("IEEE32BIG");
}
if ( sizeof(stype) == sizeof(double) ) {
format = std::string("IEEE64BIG");
}
return format;
}
////////////////////////////////////////////////////////////////////////////////
// header specification/interpretation
////////////////////////////////////////////////////////////////////////////////
class FieldMetaData : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
int, nd,
std::vector<int>, dimension,
std::vector<std::string>, boundary,
int, data_start,
std::string, hdr_version,
std::string, storage_format,
double, link_trace,
double, plaquette,
uint32_t, checksum,
uint32_t, scidac_checksuma,
uint32_t, scidac_checksumb,
unsigned int, sequence_number,
std::string, data_type,
std::string, ensemble_id,
std::string, ensemble_label,
std::string, ildg_lfn,
std::string, creator,
std::string, creator_hardware,
std::string, creation_date,
std::string, archive_date,
std::string, floating_point);
// WARNING: non-initialised values might lead to twisted parallel IO
// issues, std::string are fine because they initliase to size 0
// as per C++ standard.
FieldMetaData(void)
: nd(4), dimension(4,0), boundary(4, ""), data_start(0),
link_trace(0.), plaquette(0.), checksum(0),
scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
{}
};
namespace QCD {
using namespace Grid;
//////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data
//////////////////////////////////////////////////////////////////////
inline void GridMetaData(GridBase *grid,FieldMetaData &header)
{
int nd = grid->_ndimension;
header.nd = nd;
header.dimension.resize(nd);
header.boundary.resize(nd);
header.data_start = 0;
for(int d=0;d<nd;d++) {
header.dimension[d] = grid->_fdimensions[d];
}
for(int d=0;d<nd;d++) {
header.boundary[d] = std::string("PERIODIC");
}
}
inline void MachineCharacteristics(FieldMetaData &header)
{
// Who
struct passwd *pw = getpwuid (getuid());
if (pw) header.creator = std::string(pw->pw_name);
// When
std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t);
std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str();
header.archive_date = header.creation_date;
// What
struct utsname name; uname(&name);
header.creator_hardware = std::string(name.nodename)+"-";
header.creator_hardware+= std::string(name.machine)+"-";
header.creator_hardware+= std::string(name.sysname)+"-";
header.creator_hardware+= std::string(name.release);
}
#define dump_meta_data(field, s) \
s << "BEGIN_HEADER" << std::endl; \
s << "HDR_VERSION = " << field.hdr_version << std::endl; \
s << "DATATYPE = " << field.data_type << std::endl; \
s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \
for(int i=0;i<4;i++){ \
s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
} \
s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \
for(int i=0;i<4;i++){ \
s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \
} \
\
s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \
s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \
s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \
s << "CREATOR = " << field.creator << std::endl; \
s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \
s << "CREATION_DATE = " << field.creation_date << std::endl; \
s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \
s << "FLOATING_POINT = " << field.floating_point << std::endl; \
s << "END_HEADER" << std::endl;
template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
{
GridBase *grid = field._grid;
std::string format = getFormatString<vobj>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
MachineCharacteristics(header);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplF>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplD>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
GridBase *grid = field._grid;
std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{
GridBase *grid = field._grid;
std::string format = getFormatString<vLorentzColourMatrixD>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
//////////////////////////////////////////////////////////////////////
// Utilities ; these are QCD aware
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
for(int mu=0;mu<Nd;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
}
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
/////////////////////////////////////////////////////////////////////////////////
// Simple classes for precision conversion
/////////////////////////////////////////////////////////////////////////////////
template <class fobj, class sobj>
struct BinarySimpleUnmunger {
typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
void operator()(sobj &in, fobj &out) {
// take word by word and transform accoding to the status
fobj_stype *out_buffer = (fobj_stype *)&out;
sobj_stype *in_buffer = (sobj_stype *)&in;
size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
assert(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
}
};
template <class fobj, class sobj>
struct BinarySimpleMunger {
typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
void operator()(fobj &in, sobj &out) {
// take word by word and transform accoding to the status
fobj_stype *in_buffer = (fobj_stype *)&in;
sobj_stype *out_buffer = (sobj_stype *)&out;
size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
assert(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
}
};
template<class fobj,class sobj>
struct GaugeSimpleMunger{
void operator()(fobj &in, sobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
reconstruct3(out);
}
};
template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}
}
};
}
}

363
Grid/parallelIO/NerscIO.h Normal file
View File

@ -0,0 +1,363 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/NerscIO.h
Copyright (C) 2015
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
namespace Grid {
namespace QCD {
using namespace Grid;
////////////////////////////////////////////////////////////////////////////////
// Write and read from fstream; comput header offset for payload
////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO {
public:
static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out);
}
static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
{
std::ofstream fout(file,std::ios::out|std::ios::in);
fout.seekp(0,std::ios::beg);
dump_meta_data(field, fout);
field.data_start = fout.tellp();
return field.data_start;
}
// for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{
uint64_t offset=0;
std::map<std::string,std::string> header;
std::string line;
//////////////////////////////////////////////////
// read the header
//////////////////////////////////////////////////
std::ifstream fin(file);
getline(fin,line); // read one line and insist is
removeWhitespace(line);
std::cout << GridLogMessage << "* " << line << std::endl;
assert(line==std::string("BEGIN_HEADER"));
do {
getline(fin,line); // read one line
std::cout << GridLogMessage << "* "<<line<< std::endl;
int eq = line.find("=");
if(eq >0) {
std::string key=line.substr(0,eq);
std::string val=line.substr(eq+1);
removeWhitespace(key);
removeWhitespace(val);
header[key] = val;
}
} while( line.find("END_HEADER") == std::string::npos );
field.data_start = fin.tellg();
//////////////////////////////////////////////////
// chomp the values
//////////////////////////////////////////////////
field.hdr_version = header["HDR_VERSION"];
field.data_type = header["DATATYPE"];
field.storage_format = header["STORAGE_FORMAT"];
field.dimension[0] = std::stol(header["DIMENSION_1"]);
field.dimension[1] = std::stol(header["DIMENSION_2"]);
field.dimension[2] = std::stol(header["DIMENSION_3"]);
field.dimension[3] = std::stol(header["DIMENSION_4"]);
assert(grid->_ndimension == 4);
for(int d=0;d<4;d++){
assert(grid->_fdimensions[d]==field.dimension[d]);
}
field.link_trace = std::stod(header["LINK_TRACE"]);
field.plaquette = std::stod(header["PLAQUETTE"]);
field.boundary[0] = header["BOUNDARY_1"];
field.boundary[1] = header["BOUNDARY_2"];
field.boundary[2] = header["BOUNDARY_3"];
field.boundary[3] = header["BOUNDARY_4"];
field.checksum = std::stoul(header["CHECKSUM"],0,16);
field.ensemble_id = header["ENSEMBLE_ID"];
field.ensemble_label = header["ENSEMBLE_LABEL"];
field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]);
field.creator = header["CREATOR"];
field.creator_hardware = header["CREATOR_HARDWARE"];
field.creation_date = header["CREATION_DATE"];
field.archive_date = header["ARCHIVE_DATE"];
field.floating_point = header["FLOATING_POINT"];
return field.data_start;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
FieldMetaData& header,
std::string file)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu._grid;
uint64_t offset = readHeader(file,Umu._grid,header);
FieldMetaData clone(header);
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
assert(0);
}
GaugeStatistics(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
<<" header "<<header.plaquette<<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
<<" header "<<header.link_trace<<std::endl;
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
std::cout << " Plaquette mismatch "<<std::endl;
std::cout << Umu[0]<<std::endl;
std::cout << Umu[1]<<std::endl;
}
if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl;
std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
}
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
std::string file,
int two_row,
int bits32)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
GridBase *grid = Umu._grid;
GridMetaData(grid,header);
assert(header.nd==4);
GaugeStatistics(Umu,header);
MachineCharacteristics(header);
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
}
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum
<<std::dec<<" plaq "<< header.plaquette <<std::endl;
}
///////////////////////////////
// RNG state
///////////////////////////////
static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
// Following should become arguments
FieldMetaData header;
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
GridBase *grid = parallel._grid;
GridMetaData(grid,header);
assert(header.nd==4);
header.link_trace=0.0;
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
#endif
#ifdef RNG_MT19937
header.floating_point = std::string("UINT32");
header.data_type = std::string("MT19937");
#endif
#ifdef RNG_SITMO
header.floating_point = std::string("UINT64");
header.data_type = std::string("SITMO");
#endif
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
offset = writeHeader(header,file);
}
std::cout<<GridLogMessage
<<"Written NERSC RNG STATE "<<file<< " checksum "
<<std::hex<<header.checksum
<<std::dec<<std::endl;
}
static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
GridBase *grid = parallel._grid;
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);
std::string format(header.floating_point);
std::string data_type(header.data_type);
#ifdef RNG_RANLUX
assert(format == std::string("UINT64"));
assert(data_type == std::string("RANLUX48"));
#endif
#ifdef RNG_MT19937
assert(format == std::string("UINT32"));
assert(data_type == std::string("MT19937"));
#endif
#ifdef RNG_SITMO
assert(format == std::string("UINT64"));
assert(data_type == std::string("SITMO"));
#endif
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
if ( nersc_csum != header.checksum ) {
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
exit(0);
}
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
}
};
}}
#endif

75
Grid/perfmon/PerfCount.cc Normal file
View File

@ -0,0 +1,75 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/PerfCount.cc
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
namespace Grid {
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
#ifdef __linux__
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef KNL
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
// 11
#else
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
// 11
#endif
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
//15
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
//19
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif
};
}

245
Grid/perfmon/PerfCount.h Normal file
View File

@ -0,0 +1,245 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/PerfCount.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <peterboyle@MacBook-Pro.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H
#include <sys/time.h>
#include <ctime>
#include <chrono>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#ifdef __linux__
#include <syscall.h>
#include <linux/perf_event.h>
#else
#include <sys/syscall.h>
#endif
#ifdef __x86_64__
#include <x86intrin.h>
#endif
namespace Grid {
#ifdef __linux__
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
int ret=0;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
#endif
#ifdef TIMERS_OFF
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/
#ifdef __bgq__
inline uint64_t cyclecount(void){
uint64_t tmp;
asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) );
return tmp;
}
#elif defined __x86_64__
inline uint64_t cyclecount(void){
return __rdtsc();
// unsigned int dummy;
// return __rdtscp(&dummy);
}
#else
inline uint64_t cyclecount(void){
return 0;
}
#endif
#endif
class PerformanceCounter {
private:
typedef struct {
public:
uint32_t type;
uint64_t config;
const char *name;
int normalisation;
} PerformanceCounterConfig;
static const PerformanceCounterConfig PerformanceCounterConfigs [];
public:
enum PerformanceCounterType {
CACHE_REFERENCES=0,
CACHE_MISSES=1,
CPUCYCLES=2,
INSTRUCTIONS=3,
L1D_READ_ACCESS=4,
PERFORMANCE_COUNTER_NUM_TYPES=19
};
public:
int PCT;
long long count;
long long cycles;
int fd;
int cyclefd;
unsigned long long elapsed;
uint64_t begin;
static int NumTypes(void){
return PERFORMANCE_COUNTER_NUM_TYPES;
}
PerformanceCounter(int _pct) {
#ifdef __linux__
assert(_pct>=0);
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
fd=-1;
cyclefd=-1;
count=0;
cycles=0;
PCT =_pct;
Open();
#endif
}
void Open(void)
{
#ifdef __linux__
struct perf_event_attr pe;
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.size = sizeof(struct perf_event_attr);
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
pe.inherit = 1;
pe.type = PerformanceCounterConfigs[PCT].type;
pe.config= PerformanceCounterConfigs[PCT].config;
const char * name = PerformanceCounterConfigs[PCT].name;
fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
perror("Error is");
}
int norm = PerformanceCounterConfigs[PCT].normalisation;
pe.type = PerformanceCounterConfigs[norm].type;
pe.config= PerformanceCounterConfigs[norm].config;
name = PerformanceCounterConfigs[norm].name;
cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
if (cyclefd == -1) {
fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
perror("Error is");
}
#endif
}
void Start(void)
{
#ifdef __linux__
if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
}
begin =cyclecount();
#else
begin = 0;
#endif
}
void Stop(void) {
count=0;
cycles=0;
#ifdef __linux__
ssize_t ign;
if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
ign=::read(fd, &count, sizeof(long long));
ign+=::read(cyclefd, &cycles, sizeof(long long));
assert(ign=2*sizeof(long long));
}
elapsed = cyclecount() - begin;
#else
elapsed = 0;
#endif
}
void Report(void) {
#ifdef __linux__
int N = PerformanceCounterConfigs[PCT].normalisation;
const char * sn = PerformanceCounterConfigs[N].name ;
const char * sc = PerformanceCounterConfigs[PCT].name;
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
sc, count, sc,sn, (double)count/(double)cycles);
#else
std::printf("%llu cycles \n", elapsed );
#endif
}
~PerformanceCounter()
{
#ifdef __linux__
::close(fd); ::close(cyclefd);
#endif
}
};
}
#endif

245
Grid/perfmon/Stat.cc Normal file
View File

@ -0,0 +1,245 @@
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Stat.h>
namespace Grid {
bool PmuStat::pmu_initialized=false;
void PmuStat::init(const char *regname)
{
#ifdef __x86_64__
name = regname;
if (!pmu_initialized)
{
std::cout<<"initialising pmu"<<std::endl;
pmu_initialized = true;
pmu_init();
}
clear();
#endif
}
void PmuStat::clear(void)
{
#ifdef __x86_64__
count = 0;
tregion = 0;
pmc0 = 0;
pmc1 = 0;
inst = 0;
cyc = 0;
ref = 0;
tcycles = 0;
reads = 0;
writes = 0;
#endif
}
void PmuStat::print(void)
{
#ifdef __x86_64__
std::cout <<"Reg "<<std::string(name)<<":\n";
std::cout <<" region "<<tregion<<std::endl;
std::cout <<" cycles "<<tcycles<<std::endl;
std::cout <<" inst "<<inst <<std::endl;
std::cout <<" cyc "<<cyc <<std::endl;
std::cout <<" ref "<<ref <<std::endl;
std::cout <<" pmc0 "<<pmc0 <<std::endl;
std::cout <<" pmc1 "<<pmc1 <<std::endl;
std::cout <<" count "<<count <<std::endl;
std::cout <<" reads "<<reads <<std::endl;
std::cout <<" writes "<<writes <<std::endl;
#endif
}
void PmuStat::start(void)
{
#ifdef __x86_64__
pmu_start();
++count;
xmemctrs(&mrstart, &mwstart);
tstart = __rdtsc();
#endif
}
void PmuStat::enter(int t)
{
#ifdef __x86_64__
counters[0][t] = __rdpmc(0);
counters[1][t] = __rdpmc(1);
counters[2][t] = __rdpmc((1<<30)|0);
counters[3][t] = __rdpmc((1<<30)|1);
counters[4][t] = __rdpmc((1<<30)|2);
counters[5][t] = __rdtsc();
#endif
}
void PmuStat::exit(int t)
{
#ifdef __x86_64__
counters[0][t] = __rdpmc(0) - counters[0][t];
counters[1][t] = __rdpmc(1) - counters[1][t];
counters[2][t] = __rdpmc((1<<30)|0) - counters[2][t];
counters[3][t] = __rdpmc((1<<30)|1) - counters[3][t];
counters[4][t] = __rdpmc((1<<30)|2) - counters[4][t];
counters[5][t] = __rdtsc() - counters[5][t];
#endif
}
void PmuStat::accum(int nthreads)
{
#ifdef __x86_64__
tend = __rdtsc();
xmemctrs(&mrend, &mwend);
pmu_stop();
for (int t = 0; t < nthreads; ++t) {
pmc0 += counters[0][t];
pmc1 += counters[1][t];
inst += counters[2][t];
cyc += counters[3][t];
ref += counters[4][t];
tcycles += counters[5][t];
}
uint64_t region = tend - tstart;
tregion += region;
uint64_t mreads = mrend - mrstart;
reads += mreads;
uint64_t mwrites = mwend - mwstart;
writes += mwrites;
#endif
}
void PmuStat::pmu_fini(void) {}
void PmuStat::pmu_start(void) {};
void PmuStat::pmu_stop(void) {};
void PmuStat::pmu_init(void)
{
#ifdef _KNIGHTS_LANDING_
KNLsetup();
#endif
}
void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
{
#ifdef _KNIGHTS_LANDING_
ctrs c;
KNLreadctrs(c);
uint64_t emr = 0, emw = 0;
for (int i = 0; i < NEDC; ++i)
{
emr += c.edcrd[i];
emw += c.edcwr[i];
}
*mr = emr;
*mw = emw;
#else
*mr = *mw = 0;
#endif
}
#ifdef _KNIGHTS_LANDING_
struct knl_gbl_ PmuStat::gbl;
#define PMU_MEM
void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
{
char fname[1024];
snprintf(fname, sizeof(fname), "%s/type", ename);
FILE *fp = fopen(fname, "r");
if (fp == 0) {
::printf("open %s", fname);
::exit(0);
}
int type;
int ret = fscanf(fp, "%d", &type);
assert(ret == 1);
fclose(fp);
// std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;
struct perf_event_attr hw = {};
hw.size = sizeof(hw);
hw.type = type;
// see /sys/devices/uncore_*/format/*
// All of the events we are interested in are configured the same way, but
// that isn't always true. Proper code would parse the format files
hw.config = event | (umask << 8);
//hw.read_format = PERF_FORMAT_GROUP;
// unfortunately the above only works within a single PMU; might
// as well just read them one at a time
int cpu = 0;
fd = perf_event_open(&hw, -1, cpu, -1, 0);
if (fd == -1) {
::printf("CPU %d, box %s, event 0x%lx", cpu, ename, hw.config);
::exit(0);
} else {
// std::cout << "event "<<std::string(ename)<<" set up for fd "<<fd<<" hw.config "<<hw.config <<std::endl;
}
}
void PmuStat::KNLsetup(void){
int ret;
char fname[1024];
// MC RPQ inserts and WPQ inserts (reads & writes)
for (int mc = 0; mc < NMC; ++mc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
// RPQ Inserts
KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
// WPQ Inserts
KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
}
// EDC RPQ inserts and WPQ inserts
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
// RPQ inserts
KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
// WPQ inserts
KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
}
// EDC HitE, HitM, MissE, MissM
for (int edc=0; edc < NEDC; ++edc)
{
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
}
}
uint64_t PmuStat::KNLreadctr(int fd)
{
uint64_t data;
size_t s = ::read(fd, &data, sizeof(data));
if (s != sizeof(uint64_t)){
::printf("read counter %lu", s);
::exit(0);
}
return data;
}
void PmuStat::KNLreadctrs(ctrs &c)
{
for (int i = 0; i < NMC; ++i)
{
c.mcrd[i] = KNLreadctr(gbl.mc_rd[i]);
c.mcwr[i] = KNLreadctr(gbl.mc_wr[i]);
}
for (int i = 0; i < NEDC; ++i)
{
c.edcrd[i] = KNLreadctr(gbl.edc_rd[i]);
c.edcwr[i] = KNLreadctr(gbl.edc_wr[i]);
}
for (int i = 0; i < NEDC; ++i)
{
c.edchite[i] = KNLreadctr(gbl.edc_hite[i]);
c.edchitm[i] = KNLreadctr(gbl.edc_hitm[i]);
c.edcmisse[i] = KNLreadctr(gbl.edc_misse[i]);
c.edcmissm[i] = KNLreadctr(gbl.edc_missm[i]);
}
}
#endif
}

104
Grid/perfmon/Stat.h Normal file
View File

@ -0,0 +1,104 @@
#ifndef _GRID_STAT_H
#define _GRID_STAT_H
#ifdef AVX512
#define _KNIGHTS_LANDING_ROOTONLY
#endif
namespace Grid {
///////////////////////////////////////////////////////////////////////////////
// Extra KNL counters from MCDRAM
///////////////////////////////////////////////////////////////////////////////
#ifdef _KNIGHTS_LANDING_
#define NMC 6
#define NEDC 8
struct ctrs
{
uint64_t mcrd[NMC];
uint64_t mcwr[NMC];
uint64_t edcrd[NEDC];
uint64_t edcwr[NEDC];
uint64_t edchite[NEDC];
uint64_t edchitm[NEDC];
uint64_t edcmisse[NEDC];
uint64_t edcmissm[NEDC];
};
// Peter/Azusa:
// Our modification of a code provided by Larry Meadows from Intel
// Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
// so is already public and in the linux kernel for KNL.
struct knl_gbl_
{
int mc_rd[NMC];
int mc_wr[NMC];
int edc_rd[NEDC];
int edc_wr[NEDC];
int edc_hite[NEDC];
int edc_hitm[NEDC];
int edc_misse[NEDC];
int edc_missm[NEDC];
};
#endif
///////////////////////////////////////////////////////////////////////////////
class PmuStat
{
uint64_t counters[8][256];
#ifdef _KNIGHTS_LANDING_
static struct knl_gbl_ gbl;
#endif
const char *name;
uint64_t reads; // memory reads
uint64_t writes; // memory writes
uint64_t mrstart; // memory read counter at start of parallel region
uint64_t mrend; // memory read counter at end of parallel region
uint64_t mwstart; // memory write counter at start of parallel region
uint64_t mwend; // memory write counter at end of parallel region
// cumulative counters
uint64_t count; // number of invocations
uint64_t tregion; // total time in parallel region (from thread 0)
uint64_t tcycles; // total cycles inside parallel region
uint64_t inst, ref, cyc; // fixed counters
uint64_t pmc0, pmc1;// pmu
// add memory counters here
// temp variables
uint64_t tstart; // tsc at start of parallel region
uint64_t tend; // tsc at end of parallel region
// map for ctrs values
// 0 pmc0 start
// 1 pmc0 end
// 2 pmc1 start
// 3 pmc1 end
// 4 tsc start
// 5 tsc end
static bool pmu_initialized;
public:
static bool is_init(void){ return pmu_initialized;}
static void pmu_init(void);
static void pmu_fini(void);
static void pmu_start(void);
static void pmu_stop(void);
void accum(int nthreads);
static void xmemctrs(uint64_t *mr, uint64_t *mw);
void start(void);
void enter(int t);
void exit(int t);
void print(void);
void init(const char *regname);
void clear(void);
#ifdef _KNIGHTS_LANDING_
static void KNLsetup(void);
static uint64_t KNLreadctr(int fd);
static void KNLreadctrs(ctrs &c);
static void KNLevsetup(const char *ename, int &fd, int event, int umask);
#endif
};
}
#endif

111
Grid/perfmon/Timer.h Normal file
View File

@ -0,0 +1,111 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Timer.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_TIME_H
#define GRID_TIME_H
#include <sys/time.h>
#include <ctime>
#include <chrono>
namespace Grid {
// Dress the output; use std::chrono
// C++11 time facilities better?
inline double usecond(void) {
struct timeval tv;
#ifdef TIMERS_ON
gettimeofday(&tv,NULL);
#endif
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridTime;
typedef std::chrono::microseconds GridUsecs;
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
{
stream << time.count()<<" ms";
return stream;
}
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
{
stream << time.count()<<" usec";
return stream;
}
class GridStopWatch {
private:
bool running;
GridTimePoint start;
GridUsecs accumulator;
public:
GridStopWatch () {
Reset();
}
void Start(void) {
assert(running == false);
#ifdef TIMERS_ON
start = GridClock::now();
#endif
running = true;
}
void Stop(void) {
assert(running == true);
#ifdef TIMERS_ON
accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
#endif
running = false;
};
void Reset(void){
running = false;
#ifdef TIMERS_ON
start = GridClock::now();
#endif
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
}
GridTime Elapsed(void) {
assert(running == false);
return std::chrono::duration_cast<GridTime>( accumulator );
}
uint64_t useconds(void){
assert(running == false);
return (uint64_t) accumulator.count();
}
bool isRunning(void){
return running;
}
};
}
#endif

View File

@ -0,0 +1,74 @@
/**
* pugixml parser - version 1.9
* --------------------------------------------------------
* Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at http://pugixml.org/
*
* This library is distributed under the MIT License. See notice at the end
* of this file.
*
* This work is based on the pugxml parser, which is:
* Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
*/
#ifndef HEADER_PUGICONFIG_HPP
#define HEADER_PUGICONFIG_HPP
// Uncomment this to enable wchar_t mode
// #define PUGIXML_WCHAR_MODE
// Uncomment this to enable compact mode
// #define PUGIXML_COMPACT
// Uncomment this to disable XPath
// #define PUGIXML_NO_XPATH
// Uncomment this to disable STL
// #define PUGIXML_NO_STL
// Uncomment this to disable exceptions
// #define PUGIXML_NO_EXCEPTIONS
// Set this to control attributes for public classes/functions, i.e.:
// #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
// #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
// #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
// In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
// Tune these constants to adjust memory-related behavior
// #define PUGIXML_MEMORY_PAGE_SIZE 32768
// #define PUGIXML_MEMORY_OUTPUT_STACK 10240
// #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
// Uncomment this to switch to header-only version
// #define PUGIXML_HEADER_ONLY
// Uncomment this to enable long long support
// #define PUGIXML_HAS_LONG_LONG
#endif
/**
* Copyright (c) 2006-2018 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/

12796
Grid/pugixml/pugixml.cc Normal file

File diff suppressed because it is too large Load Diff

1461
Grid/pugixml/pugixml.h Normal file

File diff suppressed because it is too large Load Diff

52
Grid/pugixml/readme.txt Normal file
View File

@ -0,0 +1,52 @@
pugixml 1.9 - an XML processing library
Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
Report bugs and download new versions at http://pugixml.org/
This is the distribution of pugixml, which is a C++ XML processing library,
which consists of a DOM-like interface with rich traversal/modification
capabilities, an extremely fast XML parser which constructs the DOM tree from
an XML file/buffer, and an XPath 1.0 implementation for complex data-driven
tree queries. Full Unicode support is also available, with Unicode interface
variants and conversions between different Unicode encodings (which happen
automatically during parsing/saving).
The distribution contains the following folders:
contrib/ - various contributions to pugixml
docs/ - documentation
docs/samples - pugixml usage examples
docs/quickstart.html - quick start guide
docs/manual.html - complete manual
scripts/ - project files for IDE/build systems
src/ - header and source files
readme.txt - this file.
This library is distributed under the MIT License:
Copyright (c) 2006-2018 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

124
Grid/qcd/LatticeTheories.h Normal file
View File

@ -0,0 +1,124 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/QCD.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LT_H
#define GRID_LT_H
namespace Grid{
// First steps in the complete generalization of the Physics part
// Design not final
namespace LatticeTheories {
template <int Dimensions>
struct LatticeTheory {
static const int Nd = Dimensions;
static const int Nds = Dimensions * 2; // double stored field
template <typename vtype>
using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
};
template <int Dimensions, int Colours>
struct LatticeGaugeTheory : public LatticeTheory<Dimensions> {
static const int Nds = Dimensions * 2;
static const int Nd = Dimensions;
static const int Nc = Colours;
template <typename vtype>
using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > >;
template <typename vtype>
using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd>;
template <typename vtype>
using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds>;
template <typename vtype>
using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
};
template <int Dimensions, int Colours, int Spin>
struct FermionicLatticeGaugeTheory
: public LatticeGaugeTheory<Dimensions, Colours> {
static const int Nd = Dimensions;
static const int Nds = Dimensions * 2;
static const int Nc = Colours;
static const int Ns = Spin;
template <typename vtype>
using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template <typename vtype>
using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template <typename vtype>
using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template <typename vtype>
using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
// These 2 only if Spin is a multiple of 2
static const int Nhs = Spin / 2;
template <typename vtype>
using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template <typename vtype>
using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
//tests
typedef iColourMatrix<Complex> ColourMatrix;
typedef iColourMatrix<ComplexF> ColourMatrixF;
typedef iColourMatrix<ComplexD> ColourMatrixD;
};
// Examples, not complete now.
struct QCD : public FermionicLatticeGaugeTheory<4, 3, 4> {
static const int Xp = 0;
static const int Yp = 1;
static const int Zp = 2;
static const int Tp = 3;
static const int Xm = 4;
static const int Ym = 5;
static const int Zm = 6;
static const int Tm = 7;
typedef FermionicLatticeGaugeTheory FLGT;
typedef FLGT::iSpinMatrix<Complex > SpinMatrix;
typedef FLGT::iSpinMatrix<ComplexF > SpinMatrixF;
typedef FLGT::iSpinMatrix<ComplexD > SpinMatrixD;
};
struct QED : public FermionicLatticeGaugeTheory<4, 1, 4> {//fill
};
template <int Dimensions>
struct Scalar : public LatticeTheory<Dimensions> {};
}; // LatticeTheories
} // Grid
#endif

511
Grid/qcd/QCD.h Normal file
View File

@ -0,0 +1,511 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/QCD.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_BASE_H
#define GRID_QCD_BASE_H
namespace Grid{
namespace QCD {
static const int Xdir = 0;
static const int Ydir = 1;
static const int Zdir = 2;
static const int Tdir = 3;
static const int Xp = 0;
static const int Yp = 1;
static const int Zp = 2;
static const int Tp = 3;
static const int Xm = 4;
static const int Ym = 5;
static const int Zm = 6;
static const int Tm = 7;
static const int Nc=3;
static const int Ns=4;
static const int Nd=4;
static const int Nhs=2; // half spinor
static const int Nds=8; // double stored gauge field
static const int Ngp=2; // gparity index range
//////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour
// note: static const int or constexpr will work for type deductions
// with the intel compiler (up to version 17)
//////////////////////////////////////////////////////////////////////////////
#define ColourIndex 2
#define SpinIndex 1
#define LorentzIndex 0
// Also should make these a named enum type
static const int DaggerNo=0;
static const int DaggerYes=1;
static const int InverseNo=0;
static const int InverseYes=1;
// Useful traits is this a spin index
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
const int SpinorIndex = 2;
template<typename T> struct isSpinor {
static const bool value = (SpinorIndex==T::TensorLevel);
};
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
// ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where Wilson operator
// naturally distributes across the 5th dimensions.
//
// That probably makes for GridRedBlack4dCartesian grid.
// s,sp,c,spc,lc
template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
// Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix;
typedef iSpinMatrix<ComplexF > SpinMatrixF;
typedef iSpinMatrix<ComplexD > SpinMatrixD;
typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iSpinMatrix<vComplexF> vSpinMatrixF;
typedef iSpinMatrix<vComplexD> vSpinMatrixD;
// Colour Matrix
typedef iColourMatrix<Complex > ColourMatrix;
typedef iColourMatrix<ComplexF > ColourMatrixF;
typedef iColourMatrix<ComplexD > ColourMatrixD;
typedef iColourMatrix<vComplex > vColourMatrix;
typedef iColourMatrix<vComplexF> vColourMatrixF;
typedef iColourMatrix<vComplexD> vColourMatrixD;
// SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iSpinColourMatrix<ComplexF > SpinColourMatrixF;
typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
// LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
// DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
// Spin vector
typedef iSpinVector<Complex > SpinVector;
typedef iSpinVector<ComplexF> SpinVectorF;
typedef iSpinVector<ComplexD> SpinVectorD;
typedef iSpinVector<vComplex > vSpinVector;
typedef iSpinVector<vComplexF> vSpinVectorF;
typedef iSpinVector<vComplexD> vSpinVectorD;
// Colour vector
typedef iColourVector<Complex > ColourVector;
typedef iColourVector<ComplexF> ColourVectorF;
typedef iColourVector<ComplexD> ColourVectorD;
typedef iColourVector<vComplex > vColourVector;
typedef iColourVector<vComplexF> vColourVectorF;
typedef iColourVector<vComplexD> vColourVectorD;
// SpinColourVector
typedef iSpinColourVector<Complex > SpinColourVector;
typedef iSpinColourVector<ComplexF> SpinColourVectorF;
typedef iSpinColourVector<ComplexD> SpinColourVectorD;
typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSpinColourVector<vComplexF> vSpinColourVectorF;
typedef iSpinColourVector<vComplexD> vSpinColourVectorD;
// HalfSpin vector
typedef iHalfSpinVector<Complex > HalfSpinVector;
typedef iHalfSpinVector<ComplexF> HalfSpinVectorF;
typedef iHalfSpinVector<ComplexD> HalfSpinVectorD;
typedef iHalfSpinVector<vComplex > vHalfSpinVector;
typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF;
typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD;
// HalfSpinColour vector
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
// singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealD> TRealD; // Shouldn't need these; can I make it work without?
typedef iSinglet<vReal > vTReal;
typedef iSinglet<vRealF> vTRealF;
typedef iSinglet<vRealD> vTRealD;
typedef iSinglet<vInteger> vTInteger;
typedef iSinglet<Integer > TInteger;
// Lattices of these
typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vColourMatrixF> LatticeColourMatrixF;
typedef Lattice<vColourMatrixD> LatticeColourMatrixD;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF;
typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF;
typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD;
typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix;
typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
// DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vSpinVectorF> LatticeSpinVectorF;
typedef Lattice<vSpinVectorD> LatticeSpinVectorD;
typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vColourVectorF> LatticeColourVectorF;
typedef Lattice<vColourVectorD> LatticeColourVectorD;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF;
typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF;
typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTRealF> LatticeRealF;
typedef Lattice<vTRealD> LatticeRealD;
typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTComplexF> LatticeComplexF;
typedef Lattice<vTComplexD> LatticeComplexD;
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
///////////////////////////////////////////
// Physical names for things
///////////////////////////////////////////
typedef LatticeHalfSpinColourVector LatticeHalfFermion;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
typedef LatticeSpinColourVector LatticeFermion;
typedef LatticeSpinColourVectorF LatticeFermionF;
typedef LatticeSpinColourVectorD LatticeFermionD;
typedef LatticeSpinColourMatrix LatticePropagator;
typedef LatticeSpinColourMatrixF LatticePropagatorF;
typedef LatticeSpinColourMatrixD LatticePropagatorD;
typedef LatticeLorentzColourMatrix LatticeGaugeField;
typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF;
typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD;
typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField;
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
// Uhgg... typing this hurt ;)
// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
typedef Lattice<vColourVector> LatticeStaggeredFermion;
typedef Lattice<vColourVectorF> LatticeStaggeredFermionF;
typedef Lattice<vColourVectorD> LatticeStaggeredFermionD;
typedef Lattice<vColourMatrix> LatticeStaggeredPropagator;
typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF;
typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD;
//////////////////////////////////////////////////////////////////////////////
// Peek and Poke named after physics attributes
//////////////////////////////////////////////////////////////////////////////
//spin
template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
{
return PeekIndex<SpinIndex>(rhs,i);
}
template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
{
return PeekIndex<SpinIndex>(rhs,i,j);
}
template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
{
return PeekIndex<SpinIndex>(rhs,i);
}
template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
{
return PeekIndex<SpinIndex>(rhs,i,j);
}
//colour
template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
{
return PeekIndex<ColourIndex>(rhs,i);
}
template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
{
return PeekIndex<ColourIndex>(rhs,i,j);
}
template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
{
return PeekIndex<ColourIndex>(rhs,i);
}
template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
{
return PeekIndex<ColourIndex>(rhs,i,j);
}
//lorentz
template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
{
return PeekIndex<LorentzIndex>(rhs,i);
}
template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
{
return PeekIndex<LorentzIndex>(rhs,i);
}
//////////////////////////////////////////////
// Poke lattice
//////////////////////////////////////////////
template<class vobj>
void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
int i)
{
PokeIndex<ColourIndex>(lhs,rhs,i);
}
template<class vobj>
void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
int i,int j)
{
PokeIndex<ColourIndex>(lhs,rhs,i,j);
}
template<class vobj>
void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
int i)
{
PokeIndex<SpinIndex>(lhs,rhs,i);
}
template<class vobj>
void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
int i,int j)
{
PokeIndex<SpinIndex>(lhs,rhs,i,j);
}
template<class vobj>
void pokeLorentz(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
int i)
{
PokeIndex<LorentzIndex>(lhs,rhs,i);
}
//////////////////////////////////////////////
// Poke scalars
//////////////////////////////////////////////
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<SpinIndex>(lhs,rhs,i);
}
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
{
pokeIndex<SpinIndex>(lhs,rhs,i,j);
}
template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<ColourIndex>(lhs,rhs,i);
}
template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
{
pokeIndex<ColourIndex>(lhs,rhs,i,j);
}
template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
{
pokeIndex<LorentzIndex>(lhs,rhs,i);
}
//////////////////////////////////////////////
// Fermion <-> propagator assignements
//////////////////////////////////////////////
//template <class Prop, class Ferm>
template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j);
for(int i = 0; i < Fimpl::Dimension; ++i)
{
pokeColour(pjs, peekColour(fj, i), i, c);
}
pokeSpin(p, pjs, j, s);
}
}
//template <class Prop, class Ferm>
template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j);
for(int i = 0; i < Fimpl::Dimension; ++i)
{
pokeColour(fj, peekColour(pjs, i, c), i);
}
pokeSpin(f, fj, j);
}
}
//////////////////////////////////////////////
// transpose array and scalar
//////////////////////////////////////////////
template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){
return transposeIndex<SpinIndex>(lhs);
}
template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){
return transposeIndex<ColourIndex>(lhs);
}
template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){
return transposeIndex<SpinIndex>(lhs);
}
template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
return transposeIndex<ColourIndex>(lhs);
}
//////////////////////////////////////////
// Trace lattice and non-lattice
//////////////////////////////////////////
template<int Index,class vobj>
inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs._odata[0]))>
{
return traceIndex<SpinIndex>(lhs);
}
template<int Index,class vobj>
inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs._odata[0]))>
{
return traceIndex<ColourIndex>(lhs);
}
template<int Index,class vobj>
inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
{
return traceIndex<SpinIndex>(lhs);
}
template<int Index,class vobj>
inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))>
{
return traceIndex<ColourIndex>(lhs);
}
//////////////////////////////////////////
// Current types
//////////////////////////////////////////
GRID_SERIALIZABLE_ENUM(Current, undef,
Vector, 0,
Axial, 1,
Tadpole, 2);
} //namespace QCD
} // Grid
#endif

50
Grid/qcd/action/Action.h Normal file
View File

@ -0,0 +1,50 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/Actions.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_ACTION_H
#define GRID_QCD_ACTION_H
////////////////////////////////////////////
// Abstract base interface
////////////////////////////////////////////
#include <Grid/qcd/action/ActionCore.h>
////////////////////////////////////////////////////////////////////////
// Fermion actions; prevent coupling fermion.cc files to other headers
////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/Fermion.h>
////////////////////////////////////////
// Pseudo fermion combinations for HMC
////////////////////////////////////////
#include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
#endif

View File

@ -0,0 +1,56 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/ActionBase.h
Copyright (C) 2015-2016
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef ACTION_BASE_H
#define ACTION_BASE_H
namespace Grid {
namespace QCD {
template <class GaugeField >
class Action
{
public:
bool is_smeared = false;
// Heatbath?
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
virtual std::string action_name() = 0; // return the action name
virtual std::string LogParameters() = 0; // prints action parameters
virtual ~Action(){}
};
}
}
#endif // ACTION_BASE_H

View File

@ -0,0 +1,61 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/ActionCore.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_ACTION_CORE
#define QCD_ACTION_CORE
#include <Grid/qcd/action/ActionBase.h>
#include <Grid/qcd/action/ActionSet.h>
#include <Grid/qcd/action/ActionParams.h>
////////////////////////////////////////////
// Gauge Actions
////////////////////////////////////////////
#include <Grid/qcd/action/gauge/Gauge.h>
////////////////////////////////////////////
// Fermion prereqs
////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FermionCore.h>
////////////////////////////////////////////
// Scalar Actions
////////////////////////////////////////////
#include <Grid/qcd/action/scalar/Scalar.h>
////////////////////////////////////////////
// Utility functions
////////////////////////////////////////////
#include <Grid/qcd/utils/Metric.h>
#include <Grid/qcd/utils/CovariantLaplacian.h>
#endif

View File

@ -0,0 +1,92 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/ActionParams.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_ACTION_PARAMS_H
#define GRID_QCD_ACTION_PARAMS_H
namespace Grid {
namespace QCD {
// These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams {
bool overlapCommsCompute;
std::vector<int> twists;
GparityWilsonImplParams() : twists(Nd, 0), overlapCommsCompute(false){};
};
struct WilsonImplParams {
bool overlapCommsCompute;
std::vector<Complex> boundary_phases;
WilsonImplParams() : overlapCommsCompute(false) {
boundary_phases.resize(Nd, 1.0);
};
WilsonImplParams(const std::vector<Complex> phi)
: boundary_phases(phi), overlapCommsCompute(false) {}
};
struct StaggeredImplParams {
StaggeredImplParams() {};
};
struct OneFlavourRationalParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams,
RealD, lo,
RealD, hi,
int, MaxIter,
RealD, tolerance,
int, degree,
int, precision);
// MaxIter and tolerance, vectors??
// constructor
OneFlavourRationalParams( RealD _lo = 0.0,
RealD _hi = 1.0,
int _maxit = 1000,
RealD tol = 1.0e-8,
int _degree = 10,
int _precision = 64)
: lo(_lo),
hi(_hi),
MaxIter(_maxit),
tolerance(tol),
degree(_degree),
precision(_precision){};
};
}
}
#endif

Some files were not shown because too many files have changed in this diff Show More