1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-13 01:05:36 +00:00

Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion

Peter's GPU branch changes merged with A2A CI code
This commit is contained in:
Fionn O hOgain 2019-09-30 16:53:44 +01:00
commit d1daab601a
785 changed files with 41312 additions and 51680 deletions

View File

@ -30,8 +30,34 @@ directory
#ifndef DISABLE_WARNINGS_H #ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H #define DISABLE_WARNINGS_H
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
//disables and intel compiler specific warning (in json.hpp) //disables and intel compiler specific warning (in json.hpp)
#pragma warning disable 488 #pragma warning disable 488
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC
#ifdef __ALTIVEC__
#define EIGEN_DONT_VECTORIZE
#endif
#ifdef __VSX__
#define EIGEN_DONT_VECTORIZE
#endif
#endif #endif

View File

@ -38,16 +38,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_BASE_H #ifndef GRID_BASE_H
#define GRID_BASE_H #define GRID_BASE_H
#include <Grid/GridStd.h>
#include <Grid/DisableWarnings.h>
#include <Grid/Namespace.h>
#include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h> #include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h> #include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/threads/Threads.h> #include <Grid/threads/Threads.h>
#include <Grid/util/Util.h> #include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h> #include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h> #include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h> #include <Grid/cartesian/Cartesian.h>
@ -57,5 +60,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/stencil/Stencil.h> #include <Grid/stencil/Stencil.h>
#include <Grid/parallelIO/BinaryIO.h> #include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h> #include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore)
#endif #endif

View File

@ -38,5 +38,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/qcd/spin/Spin.h> #include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/utils/Utils.h> #include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h> #include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);
#endif #endif

View File

@ -7,6 +7,7 @@
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <vector> #include <vector>
#include <array>
#include <string> #include <string>
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>

View File

@ -1,14 +1,41 @@
#include <Grid/GridCore.h>
#pragma once #pragma once
// Force Eigen to use MKL if Grid has been configured with --enable-mkl // Force Eigen to use MKL if Grid has been configured with --enable-mkl
#ifdef USE_MKL #ifdef USE_MKL
#define EIGEN_USE_MKL_ALL #define EIGEN_USE_MKL_ALL
#endif #endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif #endif
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#pragma diag_suppress code_is_unreachable
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__
#endif
#include <Grid/Eigen/Dense> #include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
/* NVCC restore */
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop
#endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif

1
Grid/Grid_Eigen_Tensor.h Normal file
View File

@ -0,0 +1 @@
#include <Grid/Grid_Eigen_Dense.h>

38
Grid/Namespace.h Normal file
View File

@ -0,0 +1,38 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Namespace.h
Copyright (C) 2016
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <type_traits>
#include <cassert>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -26,501 +26,487 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H #ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H
namespace Grid { NAMESPACE_BEGIN(Grid);
class Geometry { class Geometry {
// int dimension; // int dimension;
public: public:
int npoint; int npoint;
std::vector<int> directions ; std::vector<int> directions ;
std::vector<int> displacements; std::vector<int> displacements;
Geometry(int _d) { Geometry(int _d) {
int base = (_d==5) ? 1:0; int base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d // make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4; if ( _d==5 ) _d=4;
npoint = 2*_d+1; npoint = 2*_d+1;
directions.resize(npoint); directions.resize(npoint);
displacements.resize(npoint); displacements.resize(npoint);
for(int d=0;d<_d;d++){ for(int d=0;d<_d;d++){
directions[2*d ] = d+base; directions[2*d ] = d+base;
directions[2*d+1] = d+base; directions[2*d+1] = d+base;
displacements[2*d ] = +1; displacements[2*d ] = +1;
displacements[2*d+1] = -1; displacements[2*d+1] = -1;
} }
directions [2*_d]=0; directions [2*_d]=0;
displacements[2*_d]=0; displacements[2*_d]=0;
//// report back //// report back
std::cout<<GridLogMessage<<"directions :"; std::cout<<GridLogMessage<<"directions :";
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " "; for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
std::cout <<std::endl; std::cout <<std::endl;
std::cout<<GridLogMessage<<"displacements :"; std::cout<<GridLogMessage<<"displacements :";
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " "; for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
std::cout<<std::endl; std::cout<<std::endl;
} }
/* /*
// Original cleaner code // Original cleaner code
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) { Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
for(int d=0;d<dimension;d++){ for(int d=0;d<dimension;d++){
directions[2*d ] = d; directions[2*d ] = d;
directions[2*d+1] = d; directions[2*d+1] = d;
displacements[2*d ] = +1; displacements[2*d ] = +1;
displacements[2*d+1] = -1; displacements[2*d+1] = -1;
} }
directions [2*dimension]=0; directions [2*dimension]=0;
displacements[2*dimension]=0; displacements[2*dimension]=0;
} }
std::vector<int> GetDelta(int point) { std::vector<int> GetDelta(int point) {
std::vector<int> delta(dimension,0); std::vector<int> delta(dimension,0);
delta[directions[point]] = displacements[point]; delta[directions[point]] = displacements[point];
return delta; return delta;
};
*/
}; };
*/
};
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class Aggregation { class Aggregation {
public: public:
typedef iVector<CComplex,nbasis > siteVector; typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector; typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix; typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField; typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid), CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid), FineGrid(_FineGrid),
subspace(nbasis,_FineGrid), subspace(nbasis,_FineGrid),
checkerboard(_checkerboard) checkerboard(_checkerboard)
{ {
}; };
void Orthogonalise(void){ void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid); CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl; std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace); blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl; std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,subspace); blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl; // std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal(); // CheckOrthogonal();
} }
void CheckOrthogonal(void){ void CheckOrthogonal(void){
CoarseVector iProj(CoarseGrid); CoarseVector iProj(CoarseGrid);
CoarseVector eProj(CoarseGrid); CoarseVector eProj(CoarseGrid);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace); blockProject(iProj,subspace[i],subspace);
eProj=zero; eProj=Zero();
parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){ thread_for(ss, CoarseGrid->oSites(),{
eProj._odata[ss](i)=CComplex(1.0); eProj[ss](i)=CComplex(1.0);
} });
eProj=eProj - iProj; eProj=eProj - iProj;
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl; std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
}
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
} }
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){ std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
blockProject(CoarseVec,FineVec,subspace); }
} void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){ blockProject(CoarseVec,FineVec,subspace);
FineVec.checkerboard = subspace[0].checkerboard; }
blockPromote(CoarseVec,FineVec,subspace); void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
} FineVec.Checkerboard() = subspace[0].Checkerboard();
void CreateSubspaceRandom(GridParallelRNG &RNG){ blockPromote(CoarseVec,FineVec,subspace);
for(int i=0;i<nbasis;i++){ }
random(RNG,subspace[i]); void CreateSubspaceRandom(GridParallelRNG &RNG){
std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl; for(int i=0;i<nbasis;i++){
} random(RNG,subspace[i]);
Orthogonalise(); std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
} }
Orthogonalise();
}
/* /*
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{ {
// Run a Lanczos with sloppy convergence // Run a Lanczos with sloppy convergence
const int Nstop = nn; const int Nstop = nn;
const int Nk = nn+20; const int Nk = nn+20;
const int Np = nn+20; const int Np = nn+20;
const int Nm = Nk+Np; const int Nm = Nk+Np;
const int MaxIt= 10000; const int MaxIt= 10000;
RealD resid = 1.0e-3; RealD resid = 1.0e-3;
Chebyshev<FineField> Cheb(0.5,64.0,21); Chebyshev<FineField> Cheb(0.5,64.0,21);
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt); ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
// IRL.lock = 1; // IRL.lock = 1;
FineField noise(FineGrid); gaussian(RNG,noise); FineField noise(FineGrid); gaussian(RNG,noise);
FineField tmp(FineGrid); FineField tmp(FineGrid);
std::vector<RealD> eval(Nm); std::vector<RealD> eval(Nm);
std::vector<FineField> evec(Nm,FineGrid); std::vector<FineField> evec(Nm,FineGrid);
int Nconv; int Nconv;
IRL.calc(eval,evec, IRL.calc(eval,evec,
noise, noise,
Nconv); Nconv);
// pull back nn vectors // pull back nn vectors
for(int b=0;b<nn;b++){ for(int b=0;b<nn;b++){
subspace[b] = evec[b]; subspace[b] = evec[b];
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl; std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
hermop.Op(subspace[b],tmp); hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl; std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
noise = tmp - sqrt(eval[b])*subspace[b] ; noise = tmp - sqrt(eval[b])*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl; std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
noise = tmp + eval[b]*subspace[b] ; noise = tmp + eval[b]*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl; std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
}
Orthogonalise();
for(int b=0;b<nn;b++){
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
}
} }
*/ Orthogonalise();
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) { for(int b=0;b<nn;b++){
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
}
}
*/
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale; RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,10000); ConjugateGradient<FineField> CG(1.0e-2,10000);
FineField noise(FineGrid); FineField noise(FineGrid);
FineField Mn(FineGrid); FineField Mn(FineGrid);
for(int b=0;b<nn;b++){ for(int b=0;b<nn;b++){
subspace[b] = zero; subspace[b] = Zero();
gaussian(RNG,noise); gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5); scale = std::pow(norm2(noise),-0.5);
noise=noise*scale; noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
} }
Orthogonalise(); hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
} }
};
// Fine Object == (per site) type of fine field Orthogonalise();
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis> }
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > { };
public: // Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > siteVector; typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector; typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix; typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField; typedef Lattice<Fobj > FineField;
//////////////////// ////////////////////
// Data members // Data members
//////////////////// ////////////////////
Geometry geom; Geometry geom;
GridBase * _grid; GridBase * _grid;
CartesianStencil<siteVector,siteVector> Stencil;
std::vector<CoarseMatrix> A; CartesianStencil<siteVector,siteVector,int> Stencil;
std::vector<CoarseMatrix> A;
/////////////////////// ///////////////////////
// Interface // Interface
/////////////////////// ///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
RealD M (const CoarseVector &in, CoarseVector &out){ RealD M (const CoarseVector &in, CoarseVector &out){
conformable(_grid,in._grid); conformable(_grid,in.Grid());
conformable(in._grid,out._grid); conformable(in.Grid(),out.Grid());
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
auto in_v = in.View();
auto out_v = in.View();
thread_for(ss,Grid()->oSites(),{
siteVector res = Zero();
siteVector nbr;
int ptype;
StencilEntry *SE;
for(int point=0;point<geom.npoint;point++){
parallel_for(int ss=0;ss<Grid()->oSites();ss++){ SE=Stencil.GetEntry(ptype,point,ss);
siteVector res = zero;
siteVector nbr;
int ptype;
StencilEntry *SE;
for(int point=0;point<geom.npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) { if(SE->_is_local&&SE->_permute) {
permute(nbr,in._odata[SE->_offset],ptype); permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) { } else if(SE->_is_local) {
nbr = in._odata[SE->_offset]; nbr = in_v[SE->_offset];
} else { } else {
nbr = Stencil.CommBuf()[SE->_offset]; nbr = Stencil.CommBuf()[SE->_offset];
}
res = res + A[point]._odata[ss]*nbr;
} }
vstream(out._odata[ss],res); auto A_point = A[point].View();
res = res + A_point[ss]*nbr;
} }
return norm2(out); vstream(out_v[ss],res);
}; });
return norm2(out);
};
RealD Mdag (const CoarseVector &in, CoarseVector &out){ RealD Mdag (const CoarseVector &in, CoarseVector &out){
// // corresponds to Petrov-Galerkin coarsening // // corresponds to Petrov-Galerkin coarsening
// return M(in,out); // return M(in,out);
// corresponds to Galerkin coarsening
CoarseVector tmp(Grid());
G5C(tmp, in);
M(tmp, out);
G5C(out, out);
return norm2(out);
};
// corresponds to Galerkin coarsening void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
CoarseVector tmp(Grid());
G5C(tmp, in); conformable(_grid,in.Grid());
M(tmp, out); conformable(in.Grid(),out.Grid());
G5C(out, out);
return norm2(out); SimpleCompressor<siteVector> compressor;
}; Stencil.HaloExchange(in,compressor);
auto point = [dir, disp](){
if(dir == 0 and disp == 0)
return 8;
else
return (4 * dir + 1 - disp) / 2;
}();
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){ auto out_v = out.View();
auto in_v = in.View();
conformable(_grid,in._grid); thread_for(ss,Grid()->oSites(),{
conformable(in._grid,out._grid); siteVector res = Zero();
siteVector nbr;
SimpleCompressor<siteVector> compressor; int ptype;
Stencil.HaloExchange(in,compressor); StencilEntry *SE;
auto point = [dir, disp](){ SE=Stencil.GetEntry(ptype,point,ss);
if(dir == 0 and disp == 0)
return 8; if(SE->_is_local&&SE->_permute) {
else permute(nbr,in_v[SE->_offset],ptype);
return (4 * dir + 1 - disp) / 2; } else if(SE->_is_local) {
}(); nbr = in_v[SE->_offset];
} else {
parallel_for(int ss=0;ss<Grid()->oSites();ss++){ nbr = Stencil.CommBuf()[SE->_offset];
siteVector res = zero;
siteVector nbr;
int ptype;
StencilEntry *SE;
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in._odata[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in._odata[SE->_offset];
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
res = res + A[point]._odata[ss]*nbr;
vstream(out._odata[ss],res);
} }
};
void Mdiag(const CoarseVector &in, CoarseVector &out){ auto A_point = A[point].View();
Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil res = res + A_point[ss]*nbr;
};
vstream(out_v[ss],res);
});
};
CoarsenedMatrix(GridCartesian &CoarseGrid) : void Mdiag(const CoarseVector &in, CoarseVector &out){
Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
};
_grid(&CoarseGrid),
geom(CoarseGrid._ndimension), CoarsenedMatrix(GridCartesian &CoarseGrid) :
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements),
A(geom.npoint,&CoarseGrid)
{
};
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, _grid(&CoarseGrid),
Aggregation<Fobj,CComplex,nbasis> & Subspace){ geom(CoarseGrid._ndimension),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid)
{
};
FineField iblock(FineGrid); // contributions from within this block void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
FineField oblock(FineGrid); // contributions from outwith this block Aggregation<Fobj,CComplex,nbasis> & Subspace){
FineField phi(FineGrid); FineField iblock(FineGrid); // contributions from within this block
FineField tmp(FineGrid); FineField oblock(FineGrid); // contributions from outwith this block
FineField zz(FineGrid); zz=zero;
FineField Mphi(FineGrid);
Lattice<iScalar<vInteger> > coor(FineGrid); FineField phi(FineGrid);
FineField tmp(FineGrid);
FineField zz(FineGrid); zz=Zero();
FineField Mphi(FineGrid);
CoarseVector iProj(Grid()); Lattice<iScalar<vInteger> > coor(FineGrid);
CoarseVector oProj(Grid());
CoarseScalar InnerProd(Grid());
// Orthogonalise the subblocks over the basis CoarseVector iProj(Grid());
blockOrthogonalise(InnerProd,Subspace.subspace); CoarseVector oProj(Grid());
CoarseScalar InnerProd(Grid());
// Compute the matrix elements of linop between this orthonormal // Orthogonalise the subblocks over the basis
// set of vectors. blockOrthogonalise(InnerProd,Subspace.subspace);
int self_stencil=-1;
for(int p=0;p<geom.npoint;p++){ // Compute the matrix elements of linop between this orthonormal
A[p]=zero; // set of vectors.
if( geom.displacements[p]==0){ int self_stencil=-1;
self_stencil=p; for(int p=0;p<geom.npoint;p++){
} A[p]=Zero();
if( geom.displacements[p]==0){
self_stencil=p;
} }
assert(self_stencil!=-1); }
assert(self_stencil!=-1);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
phi=Subspace.subspace[i]; phi=Subspace.subspace[i];
std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl; std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
for(int p=0;p<geom.npoint;p++){ for(int p=0;p<geom.npoint;p++){
int dir = geom.directions[p]; int dir = geom.directions[p];
int disp = geom.displacements[p]; int disp = geom.displacements[p];
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]); Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
LatticeCoordinate(coor,dir); LatticeCoordinate(coor,dir);
if ( disp==0 ){ if ( disp==0 ){
linop.OpDiag(phi,Mphi); linop.OpDiag(phi,Mphi);
}
else {
linop.OpDir(phi,Mphi,dir,disp);
}
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
////////////////////////////////////////////////////////////////////////
if ( disp==0 ) {
iblock = Mphi;
oblock = zero;
} else if ( disp==1 ) {
oblock = where(mod(coor,block)==(block-1),Mphi,zz);
iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
} else if ( disp==-1 ) {
oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
} else {
assert(0);
}
Subspace.ProjectToSubspace(iProj,iblock);
Subspace.ProjectToSubspace(oProj,oblock);
// blockProject(iProj,iblock,Subspace.subspace);
// blockProject(oProj,oblock,Subspace.subspace);
parallel_for(int ss=0;ss<Grid()->oSites();ss++){
for(int j=0;j<nbasis;j++){
if( disp!= 0 ) {
A[p]._odata[ss](j,i) = oProj._odata[ss](j);
}
A[self_stencil]._odata[ss](j,i) = A[self_stencil]._odata[ss](j,i) + iProj._odata[ss](j);
}
}
} }
else {
linop.OpDir(phi,Mphi,dir,disp);
}
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
////////////////////////////////////////////////////////////////////////
if ( disp==0 ) {
iblock = Mphi;
oblock = Zero();
} else if ( disp==1 ) {
oblock = where(mod(coor,block)==(block-1),Mphi,zz);
iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
} else if ( disp==-1 ) {
oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
} else {
assert(0);
}
Subspace.ProjectToSubspace(iProj,iblock);
Subspace.ProjectToSubspace(oProj,oblock);
// blockProject(iProj,iblock,Subspace.subspace);
// blockProject(oProj,oblock,Subspace.subspace);
auto iProj_v = iProj.View() ;
auto oProj_v = oProj.View() ;
auto A_p = A[p].View();
auto A_self = A[self_stencil].View();
thread_for(ss, Grid()->oSites(),{
for(int j=0;j<nbasis;j++){
if( disp!= 0 ) {
A_p[ss](j,i) = oProj_v[ss](j);
}
A_self[ss](j,i) = A_self[ss](j,i) + iProj_v[ss](j);
}
});
} }
}
#if 0 #if 0
/////////////////////////// ///////////////////////////
// test code worth preserving in if block // test code worth preserving in if block
/////////////////////////// ///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl; std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){ for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl; std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< A[p] << std::endl; std::cout<<GridLogMessage<< A[p] << std::endl;
} }
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl; std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0]; phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0); std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl; std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl; std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl; std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif #endif
// ForceHermitian(); // ForceHermitian();
// AssertHermitian(); // AssertHermitian();
// ForceDiagonal(); // ForceDiagonal();
}
void ForceHermitian(void) {
for(int d=0;d<4;d++){
int dd=d+1;
A[2*d] = adj(Cshift(A[2*d+1],dd,1));
} }
void ForceDiagonal(void) { // A[8] = 0.5*(A[8] + adj(A[8]));
}
void AssertHermitian(void) {
std::cout<<GridLogMessage<<"**************************************************"<<std::endl; CoarseMatrix AA (Grid());
std::cout<<GridLogMessage<<"**** Forcing coarse operator to be diagonal ****"<<std::endl; CoarseMatrix AAc (Grid());
std::cout<<GridLogMessage<<"**************************************************"<<std::endl; CoarseMatrix Diff (Grid());
for(int p=0;p<8;p++){ for(int d=0;d<4;d++){
A[p]=zero;
}
GridParallelRNG RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
Complex one(1.0);
iMatrix<CComplex,nbasis> ident; ident=one;
val = val*adj(val);
val = val + 1.0;
A[8] = val*ident;
// for(int s=0;s<Grid()->oSites();s++) {
// A[8]._odata[s]=val._odata[s];
// }
}
void ForceHermitian(void) {
for(int d=0;d<4;d++){
int dd=d+1;
A[2*d] = adj(Cshift(A[2*d+1],dd,1));
}
// A[8] = 0.5*(A[8] + adj(A[8]));
}
void AssertHermitian(void) {
CoarseMatrix AA (Grid());
CoarseMatrix AAc (Grid());
CoarseMatrix Diff (Grid());
for(int d=0;d<4;d++){
int dd=d+1; int dd=d+1;
AAc = Cshift(A[2*d+1],dd,1); AAc = Cshift(A[2*d+1],dd,1);
AA = A[2*d]; AA = A[2*d];
Diff = AA - adj(AAc); Diff = AA - adj(AAc);
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl; std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl; std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
}
Diff = A[8] - adj(A[8]);
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
} }
Diff = A[8] - adj(A[8]);
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
}
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,5 +1,5 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef _GRID_FFT_H_ #ifndef _GRID_FFT_H_
#define _GRID_FFT_H_ #define _GRID_FFT_H_
@ -38,64 +38,64 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { }; template<class scalar> struct FFTW { };
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> { template<> struct FFTW<ComplexD> {
public: public:
typedef fftw_complex FFTW_scalar; typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan; typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed, FFTW_scalar *in, const int *inembed,
int istride, int idist, int istride, int idist,
FFTW_scalar *out, const int *onembed, FFTW_scalar *out, const int *onembed,
int ostride, int odist, int ostride, int odist,
int sign, unsigned flags) { int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
} }
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas); ::fftw_flops(p,add,mul,fmas);
} }
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out); ::fftw_execute_dft(p,in,out);
} }
inline static void fftw_destroy_plan(const FFTW_plan p) { inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p); ::fftw_destroy_plan(p);
} }
}; };
template<> struct FFTW<ComplexF> { template<> struct FFTW<ComplexF> {
public: public:
typedef fftwf_complex FFTW_scalar; typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan; typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed, FFTW_scalar *in, const int *inembed,
int istride, int idist, int istride, int idist,
FFTW_scalar *out, const int *onembed, FFTW_scalar *out, const int *onembed,
int ostride, int odist, int ostride, int odist,
int sign, unsigned flags) { int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
} }
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas); ::fftwf_flops(p,add,mul,fmas);
} }
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out); ::fftwf_execute_dft(p,in,out);
} }
inline static void fftw_destroy_plan(const FFTW_plan p) { inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p); ::fftwf_destroy_plan(p);
} }
}; };
#endif #endif
@ -104,203 +104,188 @@ namespace Grid {
#define FFTW_BACKWARD (+1) #define FFTW_BACKWARD (+1)
#endif #endif
class FFT { class FFT {
private: private:
GridCartesian *vgrid; GridCartesian *vgrid;
GridCartesian *sgrid; GridCartesian *sgrid;
int Nd; int Nd;
double flops; double flops;
double flops_call; double flops_call;
uint64_t usec; uint64_t usec;
std::vector<int> dimensions; Coordinate dimensions;
std::vector<int> processors; Coordinate processors;
std::vector<int> processor_coor; Coordinate processor_coor;
public: public:
static const int forward=FFTW_FORWARD; static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD; static const int backward=FFTW_BACKWARD;
double Flops(void) {return flops;} double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;} double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;} double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) : FFT ( GridCartesian * grid ) :
vgrid(grid), vgrid(grid),
Nd(grid->_ndimension), Nd(grid->_ndimension),
dimensions(grid->_fdimensions), dimensions(grid->_fdimensions),
processors(grid->_processors), processors(grid->_processors),
processor_coor(grid->_processor_coor) processor_coor(grid->_processor_coor)
{ {
flops=0; flops=0;
usec =0; usec =0;
std::vector<int> layout(Nd,1); Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors); sgrid = new GridCartesian(dimensions,layout,processors);
};
~FFT ( void) {
delete sgrid;
}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
Lattice<vobj> tmp(vgrid);
tmp = source;
for(int d=0;d<Nd;d++){
if( mask[d] ) {
FFT_dim(result,tmp,d,sign);
tmp=result;
}
}
}
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
std::vector<int> mask(Nd,1);
FFT_dim_mask(result,source,mask,sign);
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
assert(0);
#else
conformable(result._grid,vgrid);
conformable(source._grid,vgrid);
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
std::vector<int> layout(Nd,1);
std::vector<int> pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
PARALLEL_REGION
{
std::vector<int> cbuf(Nd);
sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
}
}
if (p != processors[dim] - 1)
{
result = Cshift(result,dim,L);
}
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
PARALLEL_REGION
{
std::vector<int> cbuf(Nd);
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<NN;idx++) {
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
}
}
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
// writing out result
PARALLEL_REGION
{
std::vector<int> clbuf(Nd), cgbuf(Nd);
sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
}
}
result = result*div;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
}; };
}
~FFT ( void) {
delete sgrid;
}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
Lattice<vobj> tmp(vgrid);
tmp = source;
for(int d=0;d<Nd;d++){
if( mask[d] ) {
FFT_dim(result,tmp,d,sign);
tmp=result;
}
}
}
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
Coordinate mask(Nd,1);
FFT_dim_mask(result,source,mask,sign);
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
assert(0);
#else
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
Coordinate layout(Nd,1);
Coordinate pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View();
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
});
if (p != processors[dim] - 1) {
result = Cshift(result,dim,L);
}
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
thread_for( idx,NN,{
Coordinate cbuf(Nd);
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
});
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
// writing out result
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
});
result = result*div;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
};
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,153 +24,152 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALGORITHM_LINEAR_OP_H #pragma once
#define GRID_ALGORITHM_LINEAR_OP_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// LinearOperators Take a something and return a something. // LinearOperators Take a something and return a something.
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// //
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real): // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian Conjugateugate (transpose if real):
//SBase //SBase
// i) F(a x + b y) = aF(x) + b F(y). // i) F(a x + b y) = aF(x) + b F(y).
// ii) <x|Op|y> = <y|AdjOp|x>^\ast // ii) <x|Op|y> = <y|AdjOp|x>^\ast
// //
// Would be fun to have a test linearity & Herm Conj function! // Would be fun to have a test linearity & Herm Conj function!
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class LinearOperatorBase { template<class Field> class LinearOperatorBase {
public: public:
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0; virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0; virtual void HermOp(const Field &in, Field &out)=0;
}; };
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code // By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
// between RB and non-RB variants. Sparse matrix is like the fermion action def, and then // between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
// the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising // the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
// replication of code. // replication of code.
// //
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm // I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases // while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to // with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes // do it, but I fear it required multiple inheritance and mixed in abstract base classes
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Construct herm op from non-herm matrix // Construct herm op from non-herm matrix
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> template<class Matrix,class Field>
class MdagMLinearOperator : public LinearOperatorBase<Field> { class MdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
public: public:
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){}; MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out); _Mat.Mdiag(in,out);
} }
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
} }
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
} }
void AdjOp (const Field &in, Field &out){ void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2); _Mat.MdagM(in,out,n1,n2);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
RealD n1,n2; RealD n1,n2;
HermOpAndNorm(in,out,n1,n2); HermOpAndNorm(in,out,n1,n2);
} }
}; };
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother // Construct herm op and shift it for mgrid smoother
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> template<class Matrix,class Field>
class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> { class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
RealD _shift; RealD _shift;
public: public:
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){}; ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out); _Mat.Mdiag(in,out);
assert(0); assert(0);
} }
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
assert(0); assert(0);
} }
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
assert(0); assert(0);
} }
void AdjOp (const Field &in, Field &out){ void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
assert(0); assert(0);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2); _Mat.MdagM(in,out,n1,n2);
out = out + _shift*in; out = out + _shift*in;
ComplexD dot; ComplexD dot;
dot= innerProduct(in,out); dot= innerProduct(in,out);
n1=real(dot); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
RealD n1,n2; RealD n1,n2;
HermOpAndNorm(in,out,n1,n2); HermOpAndNorm(in,out,n1,n2);
} }
}; };
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Wrap an already herm matrix // Wrap an already herm matrix
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> template<class Matrix,class Field>
class HermitianLinearOperator : public LinearOperatorBase<Field> { class HermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
public: public:
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){}; HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out); _Mat.Mdiag(in,out);
} }
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
} }
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
} }
void AdjOp (const Field &in, Field &out){ void AdjOp (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out); _Mat.M(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot); ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
} }
}; };
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several // Even Odd Schur decomp operators; there are several
@ -183,13 +182,13 @@ namespace Grid {
virtual RealD Mpc (const Field &in, Field &out) =0; virtual RealD Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0; virtual RealD MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in._grid); Field tmp(in.Grid());
tmp.checkerboard = in.checkerboard; tmp.Checkerboard() = in.Checkerboard();
ni=Mpc(in,tmp); ni=Mpc(in,tmp);
no=MpcDag(tmp,out); no=MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.checkerboard = in.checkerboard; out.Checkerboard() = in.Checkerboard();
MpcDagMpc(in,out,n1,n2); MpcDagMpc(in,out,n1,n2);
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
@ -216,20 +215,20 @@ namespace Grid {
Matrix &_Mat; Matrix &_Mat;
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid); Field tmp(in.Grid());
tmp.checkerboard = !in.checkerboard; tmp.Checkerboard() = !in.Checkerboard();
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl; //std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp); _Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out); _Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl; //std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl;
_Mat.Mooee(in,out); _Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out); return axpy_norm(out,-1.0,tmp,out);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.MeooeDag(in,tmp); _Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out); _Mat.MooeeInvDag(tmp,out);
@ -247,7 +246,7 @@ namespace Grid {
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp); _Mat.MooeeInv(out,tmp);
@ -257,7 +256,7 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in); return axpy_norm(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.MooeeInvDag(in,out); _Mat.MooeeInvDag(in,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
@ -275,7 +274,7 @@ namespace Grid {
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.MooeeInv(in,out); _Mat.MooeeInv(in,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
@ -285,7 +284,7 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in); return axpy_norm(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.MeooeDag(in,out); _Mat.MeooeDag(in,out);
_Mat.MooeeInvDag(out,tmp); _Mat.MooeeInvDag(out,tmp);
@ -315,7 +314,7 @@ namespace Grid {
double tMeo; double tMeo;
double taxpby_norm; double taxpby_norm;
uint64_t ncall; uint64_t ncall;
public: public:
void Report(void) void Report(void)
{ {
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl; std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
@ -333,17 +332,17 @@ namespace Grid {
taxpby_norm=0; taxpby_norm=0;
ncall=0; ncall=0;
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ncall++; ncall++;
tMpc-=usecond(); tMpc-=usecond();
n2 = Mpc(in,out); n2 = Mpc(in,out);
tMpc+=usecond(); tMpc+=usecond();
tIP-=usecond(); tIP-=usecond();
ComplexD dot= innerProduct(in,out); ComplexD dot= innerProduct(in,out);
tIP+=usecond(); tIP+=usecond();
n1 = real(dot); n1 = real(dot);
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
ncall++; ncall++;
tMpc-=usecond(); tMpc-=usecond();
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
@ -352,135 +351,145 @@ namespace Grid {
taxpby_norm-=usecond(); taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in); axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond(); taxpby_norm+=usecond();
} }
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out)
tMeo-=usecond(); {
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp); Field tmp(in.Grid());
tMeo+=usecond(); Field tmp2(in.Grid());
taxpby_norm-=usecond();
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in); // std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
taxpby_norm+=usecond(); _Mat.Mooee(in,out);
return nn; _Mat.Mooee(out,tmp);
} // std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out); tMeo-=usecond();
} _Mat.Meooe(in,out);
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { _Mat.Meooe(out,tmp);
assert(0);// Never need with staggered tMeo+=usecond();
} taxpby_norm-=usecond();
}; RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>; taxpby_norm+=usecond();
return nn;
}
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for functions of operators // Base classes for functions of operators
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template<class Field> class OperatorFunction { template<class Field> class OperatorFunction {
public: public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0; virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) { virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
assert(in.size()==out.size()); assert(in.size()==out.size());
for(int k=0;k<in.size();k++){ for(int k=0;k<in.size();k++){
(*this)(Linop,in[k],out[k]); (*this)(Linop,in[k],out[k]);
} }
};
};
template<class Field> class LinearFunction {
public:
virtual void operator() (const Field &in, Field &out) = 0;
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
public:
void operator() (const Field &in, Field &out){
out = in;
};
};
/////////////////////////////////////////////////////////////
// Base classes for Multishift solvers for operators
/////////////////////////////////////////////////////////////
template<class Field> class OperatorMultiFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
};
// FIXME : To think about
// Chroma functionality list defining LinearOperator
/*
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
virtual const Subset& subset() const = 0;
virtual unsigned long nFlops() const { return 0; }
virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
const Subset& subset() const {return all;}
};
*/
////////////////////////////////////////////////////////////////////////////////////////////
// Hermitian operator Linear function and operator function
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Linop.HermOp(in,out);
};
};
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
{}
void operator()(const Field& in, Field& out) {
_Linop.HermOp(in,out);
}
};
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
: _poly(poly), _Linop(linop) {};
void operator()(const Field& in, Field& out) {
_poly(_Linop,in,out);
}
};
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
public:
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in._grid);
Field Mtmp(in._grid);
AtoN = in;
out = AtoN*Coeffs[0];
for(int n=1;n<Coeffs.size();n++){
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
}
};
}; };
};
} template<class Field> class LinearFunction {
public:
virtual void operator() (const Field &in, Field &out) = 0;
};
#endif template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
public:
void operator() (const Field &in, Field &out){
out = in;
};
};
/////////////////////////////////////////////////////////////
// Base classes for Multishift solvers for operators
/////////////////////////////////////////////////////////////
template<class Field> class OperatorMultiFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
};
// FIXME : To think about
// Chroma functionality list defining LinearOperator
/*
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
virtual const Subset& subset() const = 0;
virtual unsigned long nFlops() const { return 0; }
virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
const Subset& subset() const {return all;}
};
*/
////////////////////////////////////////////////////////////////////////////////////////////
// Hermitian operator Linear function and operator function
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Linop.HermOp(in,out);
};
};
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
{}
void operator()(const Field& in, Field& out) {
_Linop.HermOp(in,out);
}
};
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
: _poly(poly), _Linop(linop) {};
void operator()(const Field& in, Field& out) {
_poly(_Linop,in,out);
}
};
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
public:
using OperatorFunction<Field>::operator();
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in.Grid());
Field Mtmp(in.Grid());
AtoN = in;
out = AtoN*Coeffs[0];
for(int n=1;n<Coeffs.size();n++){
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
}
};
};
NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,24 +23,24 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_PRECONDITIONER_H #ifndef GRID_PRECONDITIONER_H
#define GRID_PRECONDITIONER_H #define GRID_PRECONDITIONER_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class Field> class Preconditioner : public LinearFunction<Field> { template<class Field> class Preconditioner : public LinearFunction<Field> {
virtual void operator()(const Field &src, Field & psi)=0; virtual void operator()(const Field &src, Field & psi)=0;
}; };
template<class Field> class TrivialPrecon : public Preconditioner<Field> { template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public: public:
void operator()(const Field &src, Field & psi){ void operator()(const Field &src, Field & psi){
psi = src; psi = src;
} }
TrivialPrecon(void){}; TrivialPrecon(void){};
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,38 +23,38 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALGORITHM_SPARSE_MATRIX_H #ifndef GRID_ALGORITHM_SPARSE_MATRIX_H
#define GRID_ALGORITHM_SPARSE_MATRIX_H #define GRID_ALGORITHM_SPARSE_MATRIX_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action // Interface defining what I expect of a general sparse matrix, such as a Fermion action
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SparseMatrixBase { template<class Field> class SparseMatrixBase {
public: public:
virtual GridBase *Grid(void) =0; virtual GridBase *Grid(void) =0;
// Full checkerboar operations // Full checkerboar operations
virtual RealD M (const Field &in, Field &out)=0; virtual RealD M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0; virtual RealD Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp (in._grid); Field tmp (in.Grid());
ni=M(in,tmp); ni=M(in,tmp);
no=Mdag(tmp,out); no=Mdag(tmp,out);
} }
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
}; };
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// Interface augmented by a red black sparse matrix, such as a Fermion action // Interface augmented by a red black sparse matrix, such as a Fermion action
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> { template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public: public:
virtual GridBase *RedBlackGrid(void)=0; virtual GridBase *RedBlackGrid(void)=0;
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions // Query the even even properties to make algorithmic decisions
@ -63,17 +63,17 @@ namespace Grid {
virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const Field &in, Field &out)=0; virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0; virtual void Mooee (const Field &in, Field &out)=0;
virtual void MooeeInv (const Field &in, Field &out)=0; virtual void MooeeInv (const Field &in, Field &out)=0;
virtual void MeooeDag (const Field &in, Field &out)=0; virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0; virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0; virtual void MooeeInvDag (const Field &in, Field &out)=0;
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -25,14 +25,14 @@ Author: Christoph Lehner <clehner@bnl.gov>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CHEBYSHEV_H #ifndef GRID_CHEBYSHEV_H
#define GRID_CHEBYSHEV_H #define GRID_CHEBYSHEV_H
#include <Grid/algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
struct ChebyParams : Serializable { struct ChebyParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams, GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
@ -41,337 +41,339 @@ struct ChebyParams : Serializable {
int, Npoly); int, Npoly);
}; };
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Generic Chebyshev approximations // Generic Chebyshev approximations
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> template<class Field>
class Chebyshev : public OperatorFunction<Field> { class Chebyshev : public OperatorFunction<Field> {
private: private:
std::vector<RealD> Coeffs; using OperatorFunction<Field>::operator();
int order;
RealD hi;
RealD lo;
public: std::vector<RealD> Coeffs;
void csv(std::ostream &out){ int order;
RealD diff = hi-lo; RealD hi;
RealD delta = (hi-lo)*1.0e-9; RealD lo;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1; public:
RealD f = approx(x); void csv(std::ostream &out){
out<< x<<" "<<f<<std::endl; RealD diff = hi-lo;
} RealD delta = diff*1.0e-9;
return; for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
} }
return;
}
// Convenience for plotting the approximation // Convenience for plotting the approximation
void PlotApprox(std::ostream &out) { void PlotApprox(std::ostream &out) {
out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl; out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){ for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
out <<x<<"\t"<<approx(x)<<std::endl; out <<x<<"\t"<<approx(x)<<std::endl;
}
};
Chebyshev(){};
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
////////////////////////////////////////////////////////////////////////////////////////////////////
// CJ: the one we need for Lanczos
void Init(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
};
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
}
sumUsq = std::sqrt(sumUsq);
for(int i=1;i<=M;i++){
a[i] = U[i]/sumUsq;
}
g[0] = 1.0;
for(int m=1;m<=M;m++){
g[m] = 0;
for(int i=0;i<=M-m;i++){
g[m]+= a[i]*a[m+i];
}
}
for(int m=1;m<=M;m++){
Coeffs[m]*=g[m];
}
}
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
RealD approxD(RealD x)
{
RealD Un;
RealD Unm;
RealD Unp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD U0=1;
RealD U1=2*y;
RealD sum;
sum = Coeffs[1]*U0;
sum+= Coeffs[2]*U1*2.0;
Un =U1;
Unm=U0;
for(int i=2;i<order-1;i++){
Unp=2*y*Un-Unm;
Unm=Un;
Un =Unp;
sum+= Un*Coeffs[i+1]*(i+1.0);
}
return sum/(0.5*(hi-lo));
};
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
RealD x = x0;
RealD eps;
int i;
for (i=0;i<maxiter;i++) {
eps = approx(x) - z;
if (fabs(eps / z) < resid)
return x;
x = x - eps / approxD(x);
}
return std::numeric_limits<double>::quiet_NaN();
}
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
// std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
T1=y*xscale+in*mscale;
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
} }
}; };
Chebyshev(){};
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
template<class Field> ////////////////////////////////////////////////////////////////////////////////////////////////////
class ChebyshevLanczos : public Chebyshev<Field> { // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
private: ////////////////////////////////////////////////////////////////////////////////////////////////////
std::vector<RealD> Coeffs; // CJ: the one we need for Lanczos
int order; void Init(RealD _lo,RealD _hi,int _order)
RealD alpha; {
RealD beta; lo=_lo;
RealD mu; hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
};
public: void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) : {
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
}
sumUsq = std::sqrt(sumUsq);
for(int i=1;i<=M;i++){
a[i] = U[i]/sumUsq;
}
g[0] = 1.0;
for(int m=1;m<=M;m++){
g[m] = 0;
for(int i=0;i<=M-m;i++){
g[m]+= a[i]*a[m+i];
}
}
for(int m=1;m<=M;m++){
Coeffs[m]*=g[m];
}
}
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
RealD approxD(RealD x)
{
RealD Un;
RealD Unm;
RealD Unp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD U0=1;
RealD U1=2*y;
RealD sum;
sum = Coeffs[1]*U0;
sum+= Coeffs[2]*U1*2.0;
Un =U1;
Unm=U0;
for(int i=2;i<order-1;i++){
Unp=2*y*Un-Unm;
Unm=Un;
Un =Unp;
sum+= Un*Coeffs[i+1]*(i+1.0);
}
return sum/(0.5*(hi-lo));
};
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
RealD x = x0;
RealD eps;
int i;
for (i=0;i<maxiter;i++) {
eps = approx(x) - z;
if (fabs(eps / z) < resid)
return x;
x = x - eps / approxD(x);
}
return std::numeric_limits<double>::quiet_NaN();
}
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
T1=y*xscale+in*mscale;
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;
RealD beta;
RealD mu;
public:
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
alpha(_alpha), alpha(_alpha),
beta(_beta), beta(_beta),
mu(_mu) mu(_mu)
{ {
order=_order; order=_order;
Coeffs.resize(order); Coeffs.resize(order);
for(int i=0;i<_order;i++){ for(int i=0;i<_order;i++){
Coeffs[i] = 0.0; Coeffs[i] = 0.0;
}
Coeffs[order-1]=1.0;
};
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
RealD approx(RealD xx) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
Real aa = alpha * alpha;
Real bb = beta * beta;
RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
RealD y= x;
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
// shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
GridBase *grid=in._grid;
Field tmp(grid);
RealD aa= alpha*alpha;
RealD bb= beta * beta;
Linop.HermOp(in,out);
out = out - mu*in;
Linop.HermOp(out,tmp);
tmp = tmp - mu * out;
out = (2.0/ (aa-bb) ) * tmp - ((aa+bb)/(aa-bb))*in;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
} }
Coeffs[order-1]=1.0;
}; };
}
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
RealD approx(RealD xx) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
Real aa = alpha * alpha;
Real bb = beta * beta;
RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
RealD y= x;
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
// shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
GridBase *grid=in.Grid();
Field tmp(grid);
RealD aa= alpha*alpha;
RealD bb= beta * beta;
Linop.HermOp(in,out);
out = out - mu*in;
Linop.HermOp(out,tmp);
tmp = tmp - mu * out;
out = (2.0/ (aa-bb) ) * tmp - ((aa+bb)/(aa-bb))*in;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
NAMESPACE_END(Grid);
#endif #endif

View File

@ -26,127 +26,127 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef INCLUDED_FORECAST_H #ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H #define INCLUDED_FORECAST_H
namespace Grid { NAMESPACE_BEGIN(Grid);
// Abstract base class. // Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi) // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi). // and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field> template<class Matrix, class Field>
class Forecast class Forecast
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{ {
public: int degree = prev_solns.size();
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0; Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = Zero(); return chi; }
else if(degree == 1){ return prev_solns[0]; }
// RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = conjugate(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(abs(G[j][j]) > abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = Zero();
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = conjugate(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
}; };
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012), NAMESPACE_END(Grid);
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = zero; return chi; }
else if(degree == 1){ return prev_solns[0]; }
RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = std::conj(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = zero;
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = std::conj(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
}
#endif #endif

View File

@ -27,7 +27,8 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
double MultiShiftFunction::approx(double x) double MultiShiftFunction::approx(double x)
{ {
double a = norm; double a = norm;
@ -53,4 +54,4 @@ void MultiShiftFunction::csv(std::ostream &out)
} }
return; return;
} }
} NAMESPACE_END(Grid);

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef MULTI_SHIFT_FUNCTION #ifndef MULTI_SHIFT_FUNCTION
#define MULTI_SHIFT_FUNCTION #define MULTI_SHIFT_FUNCTION
namespace Grid { NAMESPACE_BEGIN(Grid);
class MultiShiftFunction { class MultiShiftFunction {
public: public:
@ -63,5 +63,5 @@ public:
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -298,7 +298,7 @@ void AlgRemez::stpini(bigfloat *step) {
// Search for error maxima and minima // Search for error maxima and minima
void AlgRemez::search(bigfloat *step) { void AlgRemez::search(bigfloat *step) {
bigfloat a, q, xm, ym, xn, yn, xx0, xx1; bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
int i, j, meq, emsign, ensign, steps; int i, meq, emsign, ensign, steps;
meq = neq + 1; meq = neq + 1;
bigfloat *yy = new bigfloat[meq]; bigfloat *yy = new bigfloat[meq];
@ -306,7 +306,6 @@ void AlgRemez::search(bigfloat *step) {
bigfloat eclose = 1.0e30; bigfloat eclose = 1.0e30;
bigfloat farther = 0l; bigfloat farther = 0l;
j = 1;
xx0 = apstrt; xx0 = apstrt;
for (i = 0; i < meq; i++) { for (i = 0; i < meq; i++) {

View File

@ -58,8 +58,8 @@
/* Compute the partial fraction expansion coefficients (alpha) from the /* Compute the partial fraction expansion coefficients (alpha) from the
* factored form */ * factored form */
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace Approx { NAMESPACE_BEGIN(Approx);
static void construct_partfrac(izd *z) { static void construct_partfrac(izd *z) {
int dn = z -> dn, dd = z -> dd, type = z -> type; int dn = z -> dn, dd = z -> dd, type = z -> type;
@ -516,7 +516,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
free(d); free(d);
return zd; return zd;
} }
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#ifdef TEST #ifdef TEST
@ -585,6 +587,7 @@ static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
return (ONE - T) / (ONE + T); return (ONE - T) / (ONE + T);
} }
/* Test program. Apart from printing out the parameters for R(x) it produces /* Test program. Apart from printing out the parameters for R(x) it produces
* the following data files for plotting (unless NPLOT is defined): * the following data files for plotting (unless NPLOT is defined):
* *
@ -723,5 +726,5 @@ int main(int argc, char** argv) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
#endif /* TEST */ #endif /* TEST */

View File

@ -1,13 +1,13 @@
/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */ /* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
#ifdef __cplusplus #ifdef __cplusplus
namespace Grid { #include <Grid/Namespace.h>
namespace Approx { NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
#endif #endif
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL #ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION #ifndef PRECISION
#define PRECISION double #define PRECISION double
@ -83,5 +83,6 @@ void zolotarev_free(zolotarev_data *zdata);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
}} NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif #endif

View File

@ -10,10 +10,12 @@
#ifndef INCLUDED_BIGFLOAT_H #ifndef INCLUDED_BIGFLOAT_H
#define INCLUDED_BIGFLOAT_H #define INCLUDED_BIGFLOAT_H
#define __GMP_WITHIN_CONFIGURE
#include <gmp.h> #include <gmp.h>
#include <mpf2mpfr.h> #include <mpf2mpfr.h>
#include <mpfr.h> #include <mpfr.h>
#undef __GMP_WITHIN_CONFIGURE
class bigfloat { class bigfloat {
private: private:

View File

@ -90,8 +90,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
grid = src._grid; grid = src.Grid();
RealD f; RealD f;
RealD rtzp,rtz,a,d,b; RealD rtzp,rtz,a,d,b;

View File

@ -27,11 +27,9 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H #pragma once
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
NAMESPACE_BEGIN(Grid);
namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
@ -154,12 +152,12 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{ {
int Orthog = blockDim; // First dimension is block dim; this is an assumption int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog]; Nblock = B.Grid()->_fdimensions[Orthog];
/* FAKE */ /* FAKE */
Nblock=8; Nblock=8;
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard; X.Checkerboard() = B.Checkerboard();
conformable(X, B); conformable(X, B);
Field tmp(B); Field tmp(B);
@ -334,11 +332,11 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{ {
int Orthog = blockDim; // First dimension is block dim int Orthog = blockDim; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog]; Nblock = Src.Grid()->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard; Psi.Checkerboard() = Src.Checkerboard();
conformable(Psi, Src); conformable(Psi, Src);
Field P(Src); Field P(Src);
@ -478,7 +476,7 @@ void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<
for(int b=0;b<Nblock;b++){ for(int b=0;b<Nblock;b++){
tmp[b] = Y[b]; tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) { for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + (scale*m(bp,b))*X[bp]; tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
} }
} }
for(int b=0;b<Nblock;b++){ for(int b=0;b<Nblock;b++){
@ -488,9 +486,9 @@ void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){ void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for // Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){ for(int b=0;b<Nblock;b++){
AP[b] = zero; AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) { for(int bp=0;bp<Nblock;bp++) {
AP[b] += (m(bp,b))*X[bp]; AP[b] += scomplex(m(bp,b))*X[bp];
} }
} }
} }
@ -517,7 +515,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
for(int b=0;b<Nblock;b++){ for(int b=0;b<Nblock;b++){
X[b].checkerboard = B[b].checkerboard; X[b].Checkerboard() = B[b].Checkerboard();
conformable(X[b], B[b]); conformable(X[b], B[b]);
conformable(X[b], X[0]); conformable(X[b], X[0]);
} }
@ -690,9 +688,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
IterationsToComplete = k; IterationsToComplete = k;
} }
}; };
} NAMESPACE_END(Grid);
#endif

View File

@ -34,6 +34,8 @@ namespace Grid {
template<class Field> template<class Field>
class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> { class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
// defaults to true // defaults to true
@ -52,10 +54,10 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
CommunicationAvoidingGeneralisedMinimalResidual(RealD tol, CommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit, Integer maxit,
@ -76,7 +78,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl; std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -86,7 +88,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid); Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
@ -142,11 +144,11 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
RealD cp = 0; RealD cp = 0;
Field w(src._grid); Field w(src.Grid());
Field r(src._grid); Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart // this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -157,7 +159,9 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
gamma[0] = sqrt(norm2(r)); gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r; ComplexD scale = 1.0/gamma[0];
v[0] = scale * r;
LinalgTimer.Stop(); LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) { for (int i=0; i<RestartLength; i++) {
@ -168,7 +172,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -194,11 +198,11 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -206,13 +210,13 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -221,7 +225,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -231,8 +235,8 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -27,11 +27,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_H #ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H #define GRID_CONJUGATE_GRADIENT_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@ -40,7 +40,10 @@ namespace Grid {
template <class Field> template <class Field>
class ConjugateGradient : public OperatorFunction<Field> { class ConjugateGradient : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
@ -48,17 +51,18 @@ class ConjugateGradient : public OperatorFunction<Field> {
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), : Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){}; ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
psi.checkerboard = src.checkerboard;
conformable(psi, src); conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred; RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred;
Field p(src); Field p(src);
Field mmp(src); Field mmp(src);
@ -70,7 +74,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
Linop.HermOpAndNorm(psi, mmp, d, b); Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp; r = src - mmp;
p = r; p = r;
@ -127,10 +131,13 @@ class ConjugateGradient : public OperatorFunction<Field> {
b = cp / c; b = cp / c;
LinearCombTimer.Start(); LinearCombTimer.Start();
parallel_for(int ss=0;ss<src._grid->oSites();ss++){ auto psi_v = psi.View();
vstream(psi[ss], a * p[ss] + psi[ss]); auto p_v = p.View();
vstream(p [ss], b * p[ss] + r[ss]); auto r_v = r.View();
} accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
@ -143,12 +150,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
Linop.HermOpAndNorm(psi, mmp, d, qq); Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src; p = mmp - src;
RealD srcnorm = sqrt(norm2(src)); RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p)); RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
@ -174,5 +181,5 @@ class ConjugateGradient : public OperatorFunction<Field> {
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,13 +23,12 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
namespace Grid { NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG //Mixed precision restarted defect correction CG
template<class FieldD,class FieldF, template<class FieldD,class FieldF,
@ -67,98 +66,96 @@ namespace Grid {
guesser = &g; guesser = &g;
} }
void operator() (const FieldD &src_d_in, FieldD &sol_d){ void operator() (const FieldD &src_d_in, FieldD &sol_d){
TotalInnerIterations = 0; TotalInnerIterations = 0;
GridStopWatch TotalTimer; GridStopWatch TotalTimer;
TotalTimer.Start(); TotalTimer.Start();
int cb = src_d_in.checkerboard; int cb = src_d_in.Checkerboard();
sol_d.checkerboard = cb; sol_d.Checkerboard() = cb;
RealD src_norm = norm2(src_d_in); RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance; RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid; GridBase* DoublePrecGrid = src_d_in.Grid();
FieldD tmp_d(DoublePrecGrid); FieldD tmp_d(DoublePrecGrid);
tmp_d.checkerboard = cb; tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid); FieldD tmp2_d(DoublePrecGrid);
tmp2_d.checkerboard = cb; tmp2_d.Checkerboard() = cb;
FieldD src_d(DoublePrecGrid); FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = InnerTolerance; RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid); FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb; src_f.Checkerboard() = cb;
FieldF sol_f(SinglePrecGrid); FieldF sol_f(SinglePrecGrid);
sol_f.checkerboard = cb; sol_f.Checkerboard() = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false; CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer; GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer; GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector. //Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d); Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){ if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break; break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
zeroit(sol_f);
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
} }
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TotalTimer.Stop(); PrecChangeTimer.Start();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; precisionChange(src_f, src_d);
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; PrecChangeTimer.Stop();
sol_f = Zero();
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
} }
};
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
}
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,147 +24,150 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H #ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H #define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
// single input vec, single output vec. // single input vec, single output vec.
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template<class Field> template<class Field>
class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>, class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
public OperatorFunction<Field> public OperatorFunction<Field>
{ {
public: public:
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
int verbose;
MultiShiftFunction shifts;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) : using OperatorFunction<Field>::operator();
MaxIterations(maxit),
shifts(_shifts) RealD Tolerance;
{ Integer MaxIterations;
verbose=1; Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
int verbose;
MultiShiftFunction shifts;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
verbose=1;
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
} }
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) return;
{
GridBase *grid = src._grid;
int nshift = shifts.order;
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
} }
return; void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
} {
GridBase *grid = src.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi) std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
{ std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
GridBase *grid = src._grid; std::vector<Field> ps(nshift,grid);// Search directions
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts" assert(psi.size()==nshift);
std::vector<RealD> &mresidual(shifts.tolerances); assert(mass.size()==nshift);
std::vector<RealD> alpha(nshift,1.0); assert(mresidual.size()==nshift);
std::vector<Field> ps(nshift,grid);// Search directions
assert(psi.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift]; RealD bs[nshift];
RealD rsq[nshift]; RealD rsq[nshift];
RealD z[nshift][2]; RealD z[nshift][2];
int converged[nshift]; int converged[nshift];
const int primary =0; const int primary =0;
//Primary shift fields CG iteration //Primary shift fields CG iteration
RealD a,b,c,d; RealD a,b,c,d;
RealD cp,bp,qq; //prev RealD cp,bp,qq; //prev
// Matrix mult fields // Matrix mult fields
Field r(grid); Field r(grid);
Field p(grid); Field p(grid);
Field tmp(grid); Field tmp(grid);
Field mmp(grid); Field mmp(grid);
// Check lightest mass // Check lightest mass
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] ); assert( mass[s]>= mass[primary] );
converged[s]=0; converged[s]=0;
} }
// Wire guess to zero // Wire guess to zero
// Residuals "r" are src // Residuals "r" are src
// First search direction "p" is also src // First search direction "p" is also src
cp = norm2(src); cp = norm2(src);
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s]; rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl; <<" target resid "<<rsq[s]<<std::endl;
ps[s] = src; ps[s] = src;
} }
// r and p for primary // r and p for primary
r=src; r=src;
p=src; p=src;
//MdagM+m[0] //MdagM+m[0]
Linop.HermOpAndNorm(p,mmp,d,qq); Linop.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp); axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p); RealD rn = norm2(p);
d += rn*mass[0]; d += rn*mass[0];
// have verified that inner product of // have verified that inner product of
// p and mmp is equal to d after this since // p and mmp is equal to d after this since
// the d computation is tricky // the d computation is tricky
// qq = real(innerProduct(p,mmp)); // qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl; // std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
b = -cp /d; b = -cp /d;
// Set up the various shift variables // Set up the various shift variables
int iz=0; int iz=0;
z[0][1-iz] = 1.0; z[0][1-iz] = 1.0;
z[0][iz] = 1.0; z[0][iz] = 1.0;
bs[0] = b; bs[0] = b;
for(int s=1;s<nshift;s++){ for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0; z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0])); z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz]; bs[s] = b*z[s][iz];
} }
// r += b[0] A.p[0] // r += b[0] A.p[0]
// c= norm(r) // c= norm(r)
c=axpy_norm(r,b,mmp,r); c=axpy_norm(r,b,mmp,r);
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
/////////////////////////////////////// ///////////////////////////////////////
// Timers // Timers
/////////////////////////////////////// ///////////////////////////////////////
@ -175,37 +178,37 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
SolverTimer.Start(); SolverTimer.Start();
// Iteration loop // Iteration loop
int k; int k;
for (k=1;k<=MaxIterations;k++){ for (k=1;k<=MaxIterations;k++){
a = c /cp; a = c /cp;
AXPYTimer.Start(); AXPYTimer.Start();
axpy(p,a,p,r); axpy(p,a,p,r);
AXPYTimer.Stop(); AXPYTimer.Stop();
// Note to self - direction ps is iterated seperately // Note to self - direction ps is iterated seperately
// for each shift. Does not appear to have any scope // for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case. // for avoiding linear algebra in "single" case.
// //
// However SAME r is used. Could load "r" and update // However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving // ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps // New Kernel: Load r, vector of coeffs, vector of pointers ps
AXPYTimer.Start(); AXPYTimer.Start();
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
if ( ! converged[s] ) { if ( ! converged[s] ) {
if (s==0){ if (s==0){
axpy(ps[s],a,ps[s],r); axpy(ps[s],a,ps[s],r);
} else{ } else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b); RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]); axpby(ps[s],z[s][iz],as,r,ps[s]);
}
} }
} }
}
AXPYTimer.Stop(); AXPYTimer.Stop();
cp=c; cp=c;
MatrixTimer.Start(); MatrixTimer.Start();
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
// The below is faster on KNL // The below is faster on KNL
@ -215,89 +218,89 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
MatrixTimer.Stop(); MatrixTimer.Stop();
AXPYTimer.Start(); AXPYTimer.Start();
axpy(mmp,mass[0],p,mmp); axpy(mmp,mass[0],p,mmp);
AXPYTimer.Stop(); AXPYTimer.Stop();
RealD rn = norm2(p); RealD rn = norm2(p);
d += rn*mass[0]; d += rn*mass[0];
bp=b; bp=b;
b=-cp/d; b=-cp/d;
AXPYTimer.Start(); AXPYTimer.Start();
c=axpy_norm(r,b,mmp,r); c=axpy_norm(r,b,mmp,r);
AXPYTimer.Stop(); AXPYTimer.Stop();
// Toggle the recurrence history // Toggle the recurrence history
bs[0] = b; bs[0] = b;
iz = 1-iz; iz = 1-iz;
ShiftTimer.Start(); ShiftTimer.Start();
for(int s=1;s<nshift;s++){ for(int s=1;s<nshift;s++){
if((!converged[s])){ if((!converged[s])){
RealD z0 = z[s][1-iz]; RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz]; RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b)); / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
} }
}
ShiftTimer.Stop(); ShiftTimer.Stop();
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
int ss = s; int ss = s;
// Scope for optimisation here in case of "single". // Scope for optimisation here in case of "single".
// Could load psi[0] and pull all ps[s] in. // Could load psi[0] and pull all ps[s] in.
// if ( single ) ss=primary; // if ( single ) ss=primary;
// Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving // Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain: // Pipelined CG gain:
// //
// New Kernel: Load r, vector of coeffs, vector of pointers ps // New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load psi[0], vector of coeffs, vector of pointers ps // New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
// If can predict the coefficient bs then we can fuse these and avoid write reread cyce // If can predict the coefficient bs then we can fuse these and avoid write reread cyce
// on ps[s]. // on ps[s].
// Before: 3 x npole + 3 x npole // Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG. // After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
if( (!converged[s]) ) { if( (!converged[s]) ) {
axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]); axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
}
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
} }
} }
}
if ( all_converged ){ // Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop(); SolverTimer.Stop();
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl; std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl; std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
// Check answers // Check answers
for(int s=0; s < nshift; s++) { for(int s=0; s < nshift; s++) {
Linop.HermOpAndNorm(psi[s],mmp,d,qq); Linop.HermOpAndNorm(psi[s],mmp,d,qq);
axpy(tmp,mass[s],psi[s],mmp); axpy(tmp,mass[s],psi[s],mmp);
axpy(r,-alpha[s],src,tmp); axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r); RealD rn = norm2(r);
RealD cn = norm2(src); RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
} }
std::cout << GridLogMessage << "Time Breakdown "<<std::endl; std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@ -307,16 +310,16 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
IterationsToComplete = k; IterationsToComplete = k;
return; return;
} }
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
} }
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,234 +23,236 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H #ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H #define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> template<class FieldD,class FieldF,
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> { typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
public: typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
// Defaults true. public:
RealD Tolerance; bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
Integer MaxIterations; // Defaults true.
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion RealD Tolerance;
Integer ReliableUpdatesPerformed; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
bool DoFinalCleanup; //Final DP cleanup, defaults to true bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations Integer IterationsToCleanup; //Final DP cleanup step iterations
LinearOperatorBase<FieldF> &Linop_f; LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d; LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid; GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter RealD Delta; //reliable update parameter
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback; LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol; RealD fallback_transition_tol;
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true) ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol), : Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
Delta(_delta), Delta(_delta),
Linop_f(_Linop_f), Linop_f(_Linop_f),
Linop_d(_Linop_d), Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid), SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv), ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true), DoFinalCleanup(true),
Linop_fallback(NULL) Linop_fallback(NULL)
{}; {};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){ void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback; Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol; fallback_transition_tol = _fallback_transition_tol;
} }
void operator()(const FieldD &src, FieldD &psi) { void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f; LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false; bool using_fallback = false;
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred; RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src); FieldD p(src);
FieldD mmp(src); FieldD mmp(src);
FieldD r(src); FieldD r(src);
// Initial residual computation & set up // Initial residual computation & set up
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b); Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp; r = src - mmp;
p = r; p = r;
a = norm2(p); a = norm2(p);
cp = a; cp = a;
ssq = norm2(src); ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl; std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl; std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl; std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl; std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl; std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl; std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :) // Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
return;
}
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r);
FieldF psi_f(r_f);
psi_f = Zero();
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) { if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n"; //Although not written in the paper, I assume that I have to add on the final solution
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return; return;
} }
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
//Single prec initialization Linop_d.HermOpAndNorm(psi, mmp, d, qq);
FieldF r_f(SinglePrecGrid); r = src - mmp;
r_f.checkerboard = r.checkerboard;
precisionChange(r_f, r);
FieldF psi_f(r_f); psi_f = Zero();
psi_f = zero; precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
FieldF p_f(r_f); b = cp/c;
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){ l = l+1;
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
psi_f = zero;
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
} }
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl; p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k; IterationsToComplete = k;
ReliableUpdatesPerformed = l; ReliableUpdatesPerformed = l;
} }
};
}; };
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,88 +24,90 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CONJUGATE_RESIDUAL_H #ifndef GRID_CONJUGATE_RESIDUAL_H
#define GRID_CONJUGATE_RESIDUAL_H #define GRID_CONJUGATE_RESIDUAL_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
// single input vec, single output vec. // single input vec, single output vec.
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template<class Field> template<class Field>
class ConjugateResidual : public OperatorFunction<Field> { class ConjugateResidual : public OperatorFunction<Field> {
public: public:
RealD Tolerance; using OperatorFunction<Field>::operator();
Integer MaxIterations;
int verbose;
ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { RealD Tolerance;
verbose=0; Integer MaxIterations;
}; int verbose;
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {
verbose=0;
};
RealD a, b, c, d; void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD cp, ssq,rsq;
RealD a, b; // c, d;
RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp; RealD rAr, rAAr, rArp;
RealD pAp, pAAp; RealD pAp, pAAp;
GridBase *grid = src._grid; GridBase *grid = src.Grid();
psi=zero; psi=Zero();
Field r(grid), p(grid), Ap(grid), Ar(grid); Field r(grid), p(grid), Ap(grid), Ar(grid);
r=src; r=src;
p=src; p=src;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=1;k<MaxIterations;k++){
a = rAr/pAAp;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,Ap,r);
rArp=rAr;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr); Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
cp =norm2(r); b =rAr/rArp;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=1;k<MaxIterations;k++){
a = rAr/pAAp;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,Ap,r);
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
axpy(p,b,p,r); axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar); pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl; if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<std::sqrt(cp/ssq)
<< " true residual "<<std::sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
} }
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
} }
};
} std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif #endif

View File

@ -33,7 +33,7 @@ namespace Grid {
template<class Field> template<class Field>
class ZeroGuesser: public LinearFunction<Field> { class ZeroGuesser: public LinearFunction<Field> {
public: public:
virtual void operator()(const Field &src, Field &guess) { guess = zero; }; virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
}; };
template<class Field> template<class Field>
class DoNothingGuesser: public LinearFunction<Field> { class DoNothingGuesser: public LinearFunction<Field> {
@ -60,14 +60,14 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}; DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) { virtual void operator()(const Field &src,Field &guess) {
guess = zero; guess = Zero();
assert(evec.size()==eval.size()); assert(evec.size()==eval.size());
auto N = evec.size(); auto N = evec.size();
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
const Field& tmp = evec[i]; const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
} }
guess.checkerboard = src.checkerboard; guess.Checkerboard() = src.Checkerboard();
} }
}; };
@ -90,15 +90,15 @@ public:
void operator()(const FineField &src,FineField &guess) { void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size(); int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid); CoarseField src_coarse(evec_coarse[0].Grid());
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero; CoarseField guess_coarse(evec_coarse[0].Grid()); guess_coarse = Zero();
blockProject(src_coarse,src,subspace); blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i]; const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse); axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
} }
blockPromote(guess_coarse,guess,subspace); blockPromote(guess_coarse,guess,subspace);
guess.checkerboard = src.checkerboard; guess.Checkerboard() = src.Checkerboard();
}; };
}; };

View File

@ -34,6 +34,8 @@ namespace Grid {
template<class Field> template<class Field>
class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> { class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
// defaults to true // defaults to true
@ -53,10 +55,10 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner; LinearFunction<Field> &Preconditioner;
@ -81,7 +83,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl; std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -91,7 +93,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid); Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
@ -149,12 +151,12 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
RealD cp = 0; RealD cp = 0;
Field w(src._grid); Field w(src.Grid());
Field r(src._grid); Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart // these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -176,7 +178,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -206,11 +208,11 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -218,13 +220,13 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -233,7 +235,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -243,8 +245,8 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -34,6 +34,8 @@ namespace Grid {
template<class Field> template<class Field>
class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
// defaults to true // defaults to true
@ -53,10 +55,10 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner; LinearFunction<Field> &Preconditioner;
@ -79,7 +81,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -89,7 +91,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid); Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
@ -147,12 +149,12 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD cp = 0; RealD cp = 0;
Field w(src._grid); Field w(src.Grid());
Field r(src._grid); Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart // these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -174,7 +176,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -204,11 +206,11 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -216,13 +218,13 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -231,7 +233,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -241,8 +243,8 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -34,6 +34,8 @@ namespace Grid {
template<class Field> template<class Field>
class GeneralisedMinimalResidual : public OperatorFunction<Field> { class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
// defaults to true // defaults to true
@ -52,10 +54,10 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
GeneralisedMinimalResidual(RealD tol, GeneralisedMinimalResidual(RealD tol,
Integer maxit, Integer maxit,
@ -74,7 +76,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -84,7 +86,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid); Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
@ -140,11 +142,11 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD cp = 0; RealD cp = 0;
Field w(src._grid); Field w(src.Grid());
Field r(src._grid); Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart // this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -166,7 +168,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -192,11 +194,11 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -204,13 +206,13 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -219,7 +221,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -229,8 +231,8 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -35,7 +35,7 @@ Author: Christoph Lehner <clehner@bnl.gov>
//#include <zlib.h> //#include <zlib.h>
#include <sys/stat.h> #include <sys/stat.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h // Move following 100 LOC to lattice/Lattice_basis.h
@ -52,26 +52,31 @@ void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
template<class Field> template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].View();
std::vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid; GridBase* grid = basis[0].Grid();
parallel_region for(int k=0;k<basis.size();k++){
{ basis_v[k] = basis[k].View();
}
thread_region
{
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
thread_for_in_region(ss, grid->oSites(),{
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.; for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){ for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis[k]._odata[ss]; B[j] +=Qt(j,k) * basis_v[k][ss];
} }
} }
for(int j=j0; j<j1; ++j){ for(int j=j0; j<j1; ++j){
basis[j]._odata[ss] = B[j]; basis_v[j][ss] = B[j];
} }
} });
} }
} }
@ -80,16 +85,18 @@ template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{ {
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid; GridBase* grid = basis[0].Grid();
result.checkerboard = basis[0].checkerboard; result.Checkerboard() = basis[0].Checkerboard();
parallel_for(int ss=0;ss < grid->oSites();ss++){ auto result_v=result.View();
vobj B = zero; thread_for(ss, grid->oSites(),{
vobj B = Zero();
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B +=Qt(j,k) * basis[k]._odata[ss]; auto basis_k = basis[k].View();
B +=Qt(j,k) * basis_k[ss];
} }
result._odata[ss] = B; result_v[ss] = B;
} });
} }
template<class Field> template<class Field>
@ -119,7 +126,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i); assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]); std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i]; idx[j] = idx[i];
@ -150,6 +157,19 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
basisReorderInPlace(_v,sort_vals,idx); basisReorderInPlace(_v,sort_vals,idx);
} }
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Implicitly restarted lanczos // Implicitly restarted lanczos
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
@ -289,7 +309,7 @@ public:
template<typename T> static RealD normalise(T& v) template<typename T> static RealD normalise(T& v)
{ {
RealD nn = norm2(v); RealD nn = norm2(v);
nn = sqrt(nn); nn = std::sqrt(nn);
v = v * (1.0/nn); v = v * (1.0/nn);
return nn; return nn;
} }
@ -321,8 +341,8 @@ until convergence
*/ */
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false) void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{ {
GridBase *grid = src._grid; GridBase *grid = src.Grid();
assert(grid == evec[0]._grid); assert(grid == evec[0].Grid());
GridLogIRL.TimingMode(1); GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@ -446,7 +466,7 @@ until convergence
assert(k2<Nm); assert(k2<Nm); assert(k1>0); assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl; std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Compressed vector f and beta(k2) // Compressed vector f and beta(k2)
@ -454,7 +474,7 @@ until convergence
f *= Qt(k2-1,Nm-1); f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2]; f += lme[k2-1] * evec[k2];
beta_k = norm2(f); beta_k = norm2(f);
beta_k = sqrt(beta_k); beta_k = std::sqrt(beta_k);
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl; std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k; RealD betar = 1.0/beta_k;
@ -477,7 +497,7 @@ until convergence
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl; std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
Field B(grid); B.checkerboard = evec[0].checkerboard; Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
// power of two search pattern; not every evalue in eval2 is assessed. // power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1; int allconv =1;
@ -515,7 +535,7 @@ until convergence
converged: converged:
{ {
Field B(grid); B.checkerboard = evec[0].checkerboard; Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
basisRotate(evec,Qt,0,Nk,0,Nk,Nm); basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl; std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0; Nconv=0;
@ -807,7 +827,7 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
// determination of 2x2 leading submatrix // determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2]; RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]); RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub))); RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift) // (Dsh: shift)
@ -838,5 +858,6 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
abort(); abort();
} }
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,16 +24,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H #ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H #define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid { NAMESPACE_BEGIN(Grid);
struct LanczosParams : Serializable { struct LanczosParams : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
ChebyParams, Cheby,/*Chebyshev*/ ChebyParams, Cheby,/*Chebyshev*/
int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/ int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
@ -46,7 +45,7 @@ struct LanczosParams : Serializable {
}; };
struct LocalCoherenceLanczosParams : Serializable { struct LocalCoherenceLanczosParams : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, saveEvecs, bool, saveEvecs,
bool, doFine, bool, doFine,
@ -59,7 +58,7 @@ struct LocalCoherenceLanczosParams : Serializable {
RealD , coarse_relax_tol, RealD , coarse_relax_tol,
std::vector<int>, blockSize, std::vector<int>, blockSize,
std::string, config, std::string, config,
std::vector < std::complex<double> >, omega, std::vector < ComplexD >, omega,
RealD, mass, RealD, mass,
RealD, M5); RealD, M5);
}; };
@ -83,14 +82,14 @@ public:
}; };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid; GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].checkerboard; int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.checkerboard= checkerboard; FineField fin (FineGrid); fin.Checkerboard()= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard; FineField fout(FineGrid); fout.Checkerboard() = checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl; _Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl; blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
} }
}; };
@ -117,12 +116,12 @@ public:
{ }; { };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard =checkerboard; GridBase *FineGrid = subspace[0].Grid();
FineField fout(FineGrid);fout.checkerboard =checkerboard; int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.Checkerboard() =checkerboard;
FineField fout(FineGrid);fout.Checkerboard() =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl; _poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
@ -133,7 +132,7 @@ public:
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > > class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
{ {
public: public:
typedef iVector<CComplex,nbasis > CoarseSiteVector; typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField; typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -142,7 +141,7 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
LinearFunction<CoarseField> & _Poly; LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother; OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop; LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol; RealD _coarse_relax_tol;
std::vector<FineField> &_subspace; std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly, ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
@ -182,10 +181,10 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
} }
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
GridBase *FineGrid = _subspace[0]._grid; GridBase *FineGrid = _subspace[0].Grid();
int checkerboard = _subspace[0].checkerboard; int checkerboard = _subspace[0].Checkerboard();
FineField fB(FineGrid);fB.checkerboard =checkerboard; FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard; FineField fv(FineGrid);fv.Checkerboard() =checkerboard;
blockPromote(B,fv,_subspace); blockPromote(B,fv,_subspace);
@ -305,11 +304,11 @@ public:
int Nk = nbasis; int Nk = nbasis;
subspace.resize(Nk,_FineGrid); subspace.resize(Nk,_FineGrid);
subspace[0]=1.0; subspace[0]=1.0;
subspace[0].checkerboard=_checkerboard; subspace[0].Checkerboard()=_checkerboard;
normalise(subspace[0]); normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){ for(int k=1;k<Nk;k++){
subspace[k].checkerboard=_checkerboard; subspace[k].Checkerboard()=_checkerboard;
Op(subspace[k-1],subspace[k]); Op(subspace[k-1],subspace[k]);
normalise(subspace[k]); normalise(subspace[k]);
} }
@ -360,7 +359,11 @@ public:
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; FineField src(_FineGrid);
typedef typename FineField::scalar_type Scalar;
// src=1.0;
src=Scalar(1.0);
src.Checkerboard() = _checkerboard;
int Nconv; int Nconv;
IRL.calc(evals_fine,subspace,src,Nconv,false); IRL.calc(evals_fine,subspace,src,Nconv,false);
@ -402,5 +405,5 @@ public:
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -33,6 +33,8 @@ namespace Grid {
template<class Field> class MinimalResidual : public OperatorFunction<Field> { template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the MR fails to converge. bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
@ -46,11 +48,11 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
Complex a, c; ComplexD a, c;
Real d; RealD d;
Field Mr(src); Field Mr(src);
Field r(src); Field r(src);
@ -71,7 +73,6 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "MinimalResidual: src " << ssq << std::endl; std::cout << GridLogIterative << "MinimalResidual: src " << ssq << std::endl;
std::cout << GridLogIterative << "MinimalResidual: mp " << d << std::endl;
std::cout << GridLogIterative << "MinimalResidual: cp,r " << cp << std::endl; std::cout << GridLogIterative << "MinimalResidual: cp,r " << cp << std::endl;
if (cp <= rsq) { if (cp <= rsq) {

View File

@ -34,6 +34,9 @@ namespace Grid {
template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> { class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
public: public:
using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
// defaults to true // defaults to true
@ -54,10 +57,10 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
GridBase* SinglePrecGrid; GridBase* SinglePrecGrid;
@ -84,7 +87,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) { void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -94,7 +97,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
FieldD r(src._grid); FieldD r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl; std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
@ -154,12 +157,12 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
RealD cp = 0; RealD cp = 0;
FieldD w(src._grid); FieldD w(src.Grid());
FieldD r(src._grid); FieldD r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart // these should probably be made class members so that they are only allocated once, not in every restart
std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<FieldD> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; std::vector<FieldD> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -181,7 +184,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -223,11 +226,11 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -235,13 +238,13 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -250,7 +253,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -260,8 +263,8 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,38 +23,38 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_NORMAL_EQUATIONS_H #ifndef GRID_NORMAL_EQUATIONS_H
#define GRID_NORMAL_EQUATIONS_H #define GRID_NORMAL_EQUATIONS_H
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver // Take a matrix and form an NE solver calling a Herm solver
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public OperatorFunction<Field>{ template<class Field> class NormalEquations : public OperatorFunction<Field>{
private: private:
SparseMatrixBase<Field> & _Matrix; SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver; OperatorFunction<Field> & _HermitianSolver;
public: public:
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver) NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver) {}; : _Matrix(Matrix), _HermitianSolver(HermitianSolver) {};
void operator() (const Field &in, Field &out){ void operator() (const Field &in, Field &out){
Field src(in._grid); Field src(in.Grid());
_Matrix.Mdag(in,src); _Matrix.Mdag(in,src);
_HermitianSolver(src,out); // Mdag M out = Mdag in _HermitianSolver(src,out); // Mdag M out = Mdag in
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -14,7 +14,7 @@ template<class Field> class PowerMethod
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src) RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{ {
GridBase *grid = src._grid; GridBase *grid = src.Grid();
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,97 +23,97 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H #ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
#define GRID_PREC_CONJUGATE_RESIDUAL_H #define GRID_PREC_CONJUGATE_RESIDUAL_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
// single input vec, single output vec. // single input vec, single output vec.
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template<class Field> template<class Field>
class PrecConjugateResidual : public OperatorFunction<Field> { class PrecConjugateResidual : public OperatorFunction<Field> {
public: public:
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
int verbose; int verbose;
LinearFunction<Field> &Preconditioner; LinearFunction<Field> &Preconditioner;
PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit), Preconditioner(Prec) PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit), Preconditioner(Prec)
{ {
verbose=1; verbose=1;
}; };
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b, c, d; RealD a, b, c, d;
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp; RealD rAr, rAAr, rArp;
RealD pAp, pAAp; RealD pAp, pAAp;
GridBase *grid = src._grid; GridBase *grid = src.Grid();
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid); Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=zero; psi=zero;
r = src; r = src;
Preconditioner(r,p); Preconditioner(r,p);
Linop.HermOpAndNorm(p,Ap,pAp,pAAp); Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Ar=Ap; Ar=Ap;
rAr=pAp; rAr=pAp;
rAAr=pAAp; rAAr=pAAp;
cp =norm2(r); cp =norm2(r);
ssq=norm2(src); ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq; rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl; if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=0;k<MaxIterations;k++){ for(int k=0;k<MaxIterations;k++){
Preconditioner(Ap,z); Preconditioner(Ap,z);
RealD rq= real(innerProduct(Ap,z)); RealD rq= real(innerProduct(Ap,z));
a = rAr/rq; a = rAr/rq;
axpy(psi,a,p,psi); axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,z,r); cp = axpy_norm(r,-a,z,r);
rArp=rAr; rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr); Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp; b =rAr/rArp;
axpy(p,b,p,r); axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar); pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl; if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
} }
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
} }
};
} std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_PREC_GCR_H #ifndef GRID_PREC_GCR_H
#define GRID_PREC_GCR_H #define GRID_PREC_GCR_H
@ -36,206 +36,208 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
//NB. Likely not original reference since they are focussing on a preconditioner variant. //NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper // but VPGCR was nicely written up in their paper
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class Field> template<class Field>
class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> { class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
public: public:
RealD Tolerance; using OperatorFunction<Field>::operator();
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner; RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) : LinearFunction<Field> &Preconditioner;
Tolerance(tol),
MaxIterations(maxit),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
verbose=1;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
verbose=1;
};
psi=zero; void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD cp, ssq,rsq;
ssq=norm2(src); psi=Zero();
rsq=Tolerance*Tolerance*ssq; RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src._grid); Field r(src.Grid());
PrecTimer.Reset(); PrecTimer.Reset();
MatTimer.Reset(); MatTimer.Reset();
LinalgTimer.Reset(); LinalgTimer.Reset();
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
SolverTimer.Start(); SolverTimer.Start();
steps=0; steps=0;
for(int k=0;k<MaxIterations;k++){ for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(Linop,src,psi,rsq); cp=GCRnStep(Linop,src,psi,rsq);
std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl; std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
if(cp<rsq) { if(cp<rsq) {
SolverTimer.Stop(); SolverTimer.Stop();
Linop.HermOp(psi,r); Linop.HermOp(psi,r);
axpy(r,-1.0,src,r); axpy(r,-1.0,src,r);
RealD tr = norm2(r); RealD tr = norm2(r);
std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq) << " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq) << " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl; << " target " <<Tolerance <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
return;
}
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
return;
} }
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0);
} }
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0);
}
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD cp; RealD cp;
RealD a, b, c, d; RealD a, b;
RealD zAz, zAAz; RealD zAz, zAAz;
RealD rAq, rq; RealD rq;
GridBase *grid = src._grid; GridBase *grid = src.Grid();
Field r(grid); Field r(grid);
Field z(grid); Field z(grid);
Field tmp(grid); Field tmp(grid);
Field ttmp(grid); Field ttmp(grid);
Field Az(grid); Field Az(grid);
//////////////////////////////// ////////////////////////////////
// history for flexible orthog // history for flexible orthog
//////////////////////////////// ////////////////////////////////
std::vector<Field> q(mmax,grid); std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid); std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax); std::vector<RealD> qq(mmax);
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start(); //////////////////////////////////
r=src-Az; // initial guess x0 is taken as nonzero.
LinalgTimer.Stop(); // r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
///////////////////// MatTimer.Start();
// p = Prec(r) Linop.HermOp(z,tmp);
///////////////////// MatTimer.Stop();
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start(); LinalgTimer.Start();
Linop.HermOp(z,tmp); ttmp=tmp;
MatTimer.Stop(); tmp=tmp-r;
LinalgTimer.Stop();
LinalgTimer.Start(); /*
ttmp=tmp;
tmp=tmp-r;
LinalgTimer.Stop();
/*
std::cout<<GridLogMessage<<r<<std::endl; std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl; std::cout<<GridLogMessage<<z<<std::endl;
std::cout<<GridLogMessage<<ttmp<<std::endl; std::cout<<GridLogMessage<<ttmp<<std::endl;
std::cout<<GridLogMessage<<tmp<<std::endl; std::cout<<GridLogMessage<<tmp<<std::endl;
*/ */
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
if((k==nstep-1)||(cp<rsq)){
return cp;
}
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start(); MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz); Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop(); MatTimer.Stop();
LinalgTimer.Start(); LinalgTimer.Start();
//p[0],q[0],qq[0] tmp=tmp-r;
p[0]= z; std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r); q[peri_kp]=Az;
LinalgTimer.Stop(); p[peri_kp]=z;
for(int k=0;k<nstep;k++){ int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
steps++; int peri_back=(k-back)%mmax; assert((k-back)>=0);
int kp = k+1; b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
int peri_k = k %mmax; p[peri_kp]=p[peri_kp]+b*p[peri_back];
int peri_kp= kp%mmax; q[peri_kp]=q[peri_kp]+b*q[peri_back];
LinalgTimer.Start();
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
if((k==nstep-1)||(cp<rsq)){
return cp;
}
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop();
LinalgTimer.Start();
tmp=tmp-r;
std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
} }
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
assert(0); // never reached LinalgTimer.Stop();
return cp;
} }
}; assert(0); // never reached
} return cp;
}
};
NAMESPACE_END(Grid);
#endif #endif

View File

@ -297,9 +297,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e) // src_o = (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm. _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
} }
@ -317,17 +317,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even); src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even); _Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix); SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@ -366,13 +366,13 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@ -386,17 +386,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; assert( src_e_i.checkerboard ==Even); src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.checkerboard ==Even); _Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@ -437,12 +437,12 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@ -463,12 +463,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.checkerboard ==Even); _Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.checkerboard ==Even); tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.checkerboard ==Even); _Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.checkerboard ==Odd ); setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)

View File

@ -1,11 +1,12 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <fcntl.h> #include <fcntl.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr; MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false; bool MemoryProfiler::debug = false;
#ifdef POINTER_CACHE
int PointerCache::victim; int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
@ -49,9 +50,9 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
void *PointerCache::Lookup(size_t bytes) { void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL; if (bytes < 4096 ) return NULL;
#ifdef _OPENMP #ifdef GRID_OMP
assert(omp_in_parallel()==0); assert(omp_in_parallel()==0);
#endif #endif
@ -63,7 +64,7 @@ void *PointerCache::Lookup(size_t bytes) {
} }
return NULL; return NULL;
} }
#endif
void check_huge_pages(void *Buf,uint64_t BYTES) void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
@ -90,7 +91,7 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
++n4ktotal; ++n4ktotal;
if (pageaddr != baseaddr + j * page_size) if (pageaddr != baseaddr + j * page_size)
++nnothuge; ++nnothuge;
} }
} }
int rank = CartesianCommunicator::RankWorld(); int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
@ -106,20 +107,21 @@ std::string sizeString(const size_t bytes)
double count = bytes; double count = bytes;
while (count >= 1024 && s < 7) while (count >= 1024 && s < 7)
{ {
s++; s++;
count /= 1024; count /= 1024;
} }
if (count - floor(count) == 0.0) if (count - floor(count) == 0.0)
{ {
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
} }
else else
{ {
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
} }
return std::string(buf); return std::string(buf);
} }
} NAMESPACE_END(Grid);

View File

@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H #ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H #define GRID_ALIGNED_ALLOCATOR_H
@ -40,89 +40,95 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <mm_malloc.h> #include <mm_malloc.h>
#endif #endif
namespace Grid { #define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
class PointerCache { NAMESPACE_BEGIN(Grid);
private:
static const int Ncache=8; // Move control to configure.ac and Config.h?
static int victim; #ifdef POINTER_CACHE
class PointerCache {
private:
typedef struct { static const int Ncache=8;
void *address; static int victim;
size_t bytes;
int valid; typedef struct {
} PointerCacheEntry; void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache]; static PointerCacheEntry Entries[Ncache];
public: public:
static void *Insert(void *ptr,size_t bytes) ; static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ; static void *Lookup(size_t bytes) ;
}; };
#endif
std::string sizeString(size_t bytes);
struct MemoryStats std::string sizeString(size_t bytes);
{
size_t totalAllocated{0}, maxAllocated{0}, struct MemoryStats
currentlyAllocated{0}, totalFreed{0}; {
}; size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler class MemoryProfiler
{ {
public: public:
static MemoryStats *stats; static MemoryStats *stats;
static bool debug; static bool debug;
}; };
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")" #define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \ #define profilerDebugPrint \
if (MemoryProfiler::stats)\ if (MemoryProfiler::stats) \
{\ { \
auto s = MemoryProfiler::stats;\ auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\ std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \ std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl;\ << std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \ std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl;\ << std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \ std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl;\ << std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \ std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl;\ << std::endl; \
} }
#define profilerAllocate(bytes)\ #define profilerAllocate(bytes) \
if (MemoryProfiler::stats)\ if (MemoryProfiler::stats) \
{\ { \
auto s = MemoryProfiler::stats;\ auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes);\ s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes);\ s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\ s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
}\ } \
if (MemoryProfiler::debug)\ if (MemoryProfiler::debug) \
{\ { \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\ std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint;\ profilerDebugPrint; \
} }
#define profilerFree(bytes)\ #define profilerFree(bytes) \
if (MemoryProfiler::stats)\ if (MemoryProfiler::stats) \
{\ { \
auto s = MemoryProfiler::stats;\ auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes);\ s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes);\ s->currentlyAllocated -= (bytes); \
}\ } \
if (MemoryProfiler::debug)\ if (MemoryProfiler::debug) \
{\ { \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\ std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint;\ profilerDebugPrint; \
} }
void check_huge_pages(void *Buf,uint64_t BYTES); void check_huge_pages(void *Buf,uint64_t BYTES);
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized. // A lattice of something, but assume the something is SIMDized.
@ -152,29 +158,45 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
////////////////// #ifdef POINTER_CACHE
// Hack 2MB align; could make option probably doesn't need configurability _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else #else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); pointer ptr = nullptr;
#endif #endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
// First touch optimise in threaded loop #ifdef GRID_NVCC
uint8_t *cp = (uint8_t *)ptr; ////////////////////////////////////
#ifdef GRID_OMP // Unified (managed) memory
#pragma omp parallel for ////////////////////////////////////
#endif if ( ptr == (_Tp *) NULL ) {
for(size_type n=0;n<bytes;n+=4096){ auto err = cudaMallocManaged((void **)&ptr,bytes);
cp[n]=0; if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
} }
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr; return ptr;
} }
@ -183,133 +205,40 @@ public:
profilerFree(bytes); profilerFree(bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
#ifdef HAVE_MM_MALLOC_H #ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme); if ( __freeme ) _mm_free((void *)__freeme);
#else #else
if ( __freeme ) free((void *)__freeme); if ( __freeme ) free((void *)__freeme);
#endif
#endif #endif
} }
void construct(pointer __p, const _Tp& __val) { };
// FIXME: hack for the copy constructor, eventually it must be avoided
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif
template<typename _Tp>
class commAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
// BACKTRACEFILE();
exit(0);
}
assert( bcast == (void *) ptr);
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
}
#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,commAllocator<T> >; template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
}; // namespace Grid NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CARTESIAN_H #ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H #define GRID_CARTESIAN_H

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -25,268 +25,266 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CARTESIAN_BASE_H #ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H #define GRID_CARTESIAN_BASE_H
NAMESPACE_BEGIN(Grid);
namespace Grid{ //////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid // unsigned long _ndimension;
////////////////////////////////////////////////////////////////////// // Coordinate _processors; // processor grid
// unsigned long _ndimension; // int _processor; // linear processor rank
// std::vector<int> _processors; // processor grid // Coordinate _processor_coor; // linear processor rank
// int _processor; // linear processor rank //////////////////////////////////////////////////////////////////////
// std::vector<int> _processor_coor; // linear processor rank class GridBase : public CartesianCommunicator , public GridThread {
//////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread {
public: public:
int dummy; int dummy;
// Give Lattice access // Give Lattice access
template<class object> friend class Lattice; template<class object> friend class Lattice;
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {}; GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const std::vector<int> & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {};
virtual ~GridBase() = default; GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {};
virtual ~GridBase() = default;
// Physics Grid information. // Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes. Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal Coordinate _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed Coordinate _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed Coordinate _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
std::vector<int> _ostride; // Outer stride for each dimension Coordinate _ostride; // Outer stride for each dimension
std::vector<int> _istride; // Inner stride i.e. within simd lane Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions). int _osites; // _isites*_osites = product(dimensions).
int _isites; int _isites;
int _fsites; // _isites*_osites = product(dimensions). int _fsites; // _isites*_osites = product(dimensions).
int _gsites; int _gsites;
std::vector<int> _slice_block;// subslice information Coordinate _slice_block;// subslice information
std::vector<int> _slice_stride; Coordinate _slice_stride;
std::vector<int> _slice_nblock; Coordinate _slice_nblock;
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] Coordinate _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded; bool _isCheckerBoarded;
public: public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by // Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian // GridCartesian / GridRedBlackCartesian
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0; virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const std::vector<int> &site)=0; virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
virtual int CheckerBoardFromOindex (int Oindex)=0; virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0; virtual int CheckerBoardFromOindexTable (int Oindex)=0;
////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations // Local layout calculations
////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into // These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord // "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord. // "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
// //
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously. // lanes are operated upon simultaneously.
virtual int oIndex(std::vector<int> &coor) virtual int oIndex(Coordinate &coor)
{ {
int idx=0; int idx=0;
// Works with either global or local coordinates // Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx; return idx;
} }
virtual int iIndex(std::vector<int> &lcoor) virtual int iIndex(Coordinate &lcoor)
{ {
int idx=0; int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx; return idx;
} }
inline int oIndexReduced(std::vector<int> &ocoor) inline int oIndexReduced(Coordinate &ocoor)
{ {
int idx=0; int idx=0;
// ocoor is already reduced so can eliminate the modulo operation // ocoor is already reduced so can eliminate the modulo operation
// for fast indexing and inline the routine // for fast indexing and inline the routine
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d]; for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx; return idx;
} }
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){ inline void oCoorFromOindex (Coordinate& coor,int Oindex){
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
} }
inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) { inline void InOutCoorToLocalCoor (Coordinate &ocoor, Coordinate &icoor, Coordinate &lcoor) {
lcoor.resize(_ndimension); lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
} }
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// SIMD lane addressing // SIMD lane addressing
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
inline void iCoorFromIindex(std::vector<int> &coor,int lane) inline void iCoorFromIindex(Coordinate &coor,int lane)
{ {
Lexicographic::CoorFromIndex(coor,lane,_simd_layout); Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
} }
inline int PermuteDim(int dimension){ inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1; return _simd_layout[dimension]>1;
} }
inline int PermuteType(int dimension){ inline int PermuteType(int dimension){
int permute_type=0; int permute_type=0;
// //
// FIXME: // Best way to encode this would be to present a mask
// // for which simd dimensions are rotated, and the rotation
// Best way to encode this would be to present a mask // size. If there is only one simd dimension rotated, this is just
// for which simd dimensions are rotated, and the rotation // a permute.
// size. If there is only one simd dimension rotated, this is just //
// a permute. // Cases: PermuteType == 1,2,4,8
// // Distance should be either 0,1,2..
// Cases: PermuteType == 1,2,4,8 //
// Distance should be either 0,1,2.. if ( _simd_layout[dimension] > 2 ) {
// for(int d=0;d<_ndimension;d++){
if ( _simd_layout[dimension] > 2 ) { if ( d != dimension ) assert ( (_simd_layout[d]==1) );
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
} }
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type; return permute_type;
} }
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) const { return _isites; }; for(int d=_ndimension-1;d>dimension;d--){
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites if (_simd_layout[d]>1 ) permute_type++;
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const std::vector<int> LocalStarts(void) { return _lstart; };
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
} }
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){ return permute_type;
assert(lidx<lSites()); }
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); ////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) const { return _isites; };
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; };
inline const Coordinate &FullDimensions(void) { return _fdimensions;};
inline const Coordinate &GlobalDimensions(void) { return _gdimensions;};
inline const Coordinate &LocalDimensions(void) { return _ldimensions;};
inline const Coordinate &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
} }
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){ }
gidx=0; void GlobalCoorToProcessorCoorLocalCoor(Coordinate &pcoor,Coordinate &lcoor,const Coordinate &gcoor)
int mult=1; {
for(int mu=0;mu<_ndimension;mu++) { pcoor.resize(_ndimension);
gidx+=mult*gcoor[mu]; lcoor.resize(_ndimension);
mult*=_gdimensions[mu]; for(int mu=0;mu<_ndimension;mu++){
} int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
} }
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor) }
{ void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const Coordinate &gcoor)
pcoor.resize(_ndimension); {
lcoor.resize(_ndimension); Coordinate pcoor;
for(int mu=0;mu<_ndimension;mu++){ Coordinate lcoor;
int _fld = _fdimensions[mu]/_processors[mu]; GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
pcoor[mu] = gcoor[mu]/_fld; rank = RankFromProcessorCoor(pcoor);
lcoor[mu] = gcoor[mu]%_fld; /*
} Coordinate cblcoor(lcoor);
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
{
std::vector<int> pcoor;
std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
std::vector<int> cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){ for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) { if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2; cblcoor[d] = lcoor[d]/2;
}
} }
*/ }
i_idx= iIndex(lcoor); */
o_idx= oIndex(lcoor); i_idx= iIndex(lcoor);
} o_idx= oIndex(lcoor);
}
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor) void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , Coordinate &gcoor)
{ {
gcoor.resize(_ndimension); gcoor.resize(_ndimension);
std::vector<int> coor(_ndimension); Coordinate coor(_ndimension);
ProcessorCoorFromRank(rank,coor); ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
iCoorFromIindex(coor,i_idx); iCoorFromIindex(coor,i_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
oCoorFromOindex (coor,o_idx); oCoorFromOindex (coor,o_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,Coordinate &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
} }
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor) }
{ void ProcessorCoorLocalCoorToGlobalCoor(Coordinate &Pcoor,Coordinate &Lcoor,Coordinate &gcoor)
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); {
if(CheckerBoarded(0)){ gcoor.resize(_ndimension);
fcoor[0] = fcoor[0]*2+cb; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
} }
}
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
{
gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
}
}; };
NAMESPACE_END(Grid);
}
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,97 +23,96 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CARTESIAN_FULL_H #ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H #define GRID_CARTESIAN_FULL_H
namespace Grid{ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// Grid Support. // Grid Support.
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
class GridCartesian: public GridBase { class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
virtual int CheckerBoardFromOindex (int Oindex) virtual int CheckerBoardFromOindex (int Oindex)
{ {
return 0; return 0;
} }
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
return 0; return 0;
} }
virtual int CheckerBoard(const std::vector<int> &site){ virtual int CheckerBoard(const Coordinate &site){
return 0; return 0;
} }
virtual int CheckerBoardDestination(int cb,int shift,int dim){ virtual int CheckerBoardDestination(int cb,int shift,int dim){
return 0; return 0;
} }
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){ virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
return shift; return shift;
} }
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){ virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift; return shift;
} }
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator. // Constructor takes a parent grid and possibly subdivides communicator.
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions, GridCartesian(const Coordinate &dimensions,
const std::vector<int> &simd_layout, const Coordinate &simd_layout,
const std::vector<int> &processor_grid, const Coordinate &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
} }
GridCartesian(const std::vector<int> &dimensions, GridCartesian(const Coordinate &dimensions,
const std::vector<int> &simd_layout, const Coordinate &simd_layout,
const std::vector<int> &processor_grid, const Coordinate &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
} }
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
// Construct from comm world // Construct from comm world
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions, GridCartesian(const Coordinate &dimensions,
const std::vector<int> &simd_layout, const Coordinate &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid) const Coordinate &processor_grid) : GridBase(processor_grid)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
} }
virtual ~GridCartesian() = default; virtual ~GridCartesian() = default;
void Init(const std::vector<int> &dimensions, void Init(const Coordinate &dimensions,
const std::vector<int> &simd_layout, const Coordinate &simd_layout,
const std::vector<int> &processor_grid) const Coordinate &processor_grid)
{ {
/////////////////////// ///////////////////////
// Grid information // Grid information
/////////////////////// ///////////////////////
_isCheckerBoarded = false; _isCheckerBoarded = false;
_ndimension = dimensions.size(); _ndimension = dimensions.size();
_fdimensions.resize(_ndimension); _fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension); _gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension); _ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
_ostride.resize(_ndimension); _ostride.resize(_ndimension);
_istride.resize(_ndimension); _istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1; _fsites = _gsites = _osites = _isites = 1;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
_fdimensions[d] = dimensions[d]; // Global dimensions _fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions
@ -136,30 +135,30 @@ public:
// Addressing support // Addressing support
if (d == 0) if (d == 0)
{ {
_ostride[d] = 1; _ostride[d] = 1;
_istride[d] = 1; _istride[d] = 1;
} }
else else
{ {
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1]; _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
} }
} }
/////////////////////// ///////////////////////
// subplane information // subplane information
/////////////////////// ///////////////////////
_slice_block.resize(_ndimension); _slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension); _slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension); _slice_nblock.resize(_ndimension);
int block = 1; int block = 1;
int nblock = 1; int nblock = 1;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d]; nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
nblock /= _rdimensions[d]; nblock /= _rdimensions[d];
_slice_block[d] = block; _slice_block[d] = block;
@ -167,8 +166,9 @@ public:
_slice_nblock[d] = nblock; _slice_nblock[d] = nblock;
block = block * _rdimensions[d]; block = block * _rdimensions[d];
} }
}; };
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,178 +24,147 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CARTESIAN_RED_BLACK_H #ifndef GRID_CARTESIAN_RED_BLACK_H
#define GRID_CARTESIAN_RED_BLACK_H #define GRID_CARTESIAN_RED_BLACK_H
NAMESPACE_BEGIN(Grid);
namespace Grid { static const int CbRed =0;
static const int CbBlack=1;
static const int CbRed =0; static const int Even =CbRed;
static const int CbBlack=1; static const int Odd =CbBlack;
static const int Even =CbRed;
static const int Odd =CbBlack;
// Specialise this for red black grids storing half the data like a chess board. // Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase class GridRedBlackCartesian : public GridBase
{ {
public: public:
std::vector<int> _checker_dim_mask; Coordinate _checker_dim_mask;
int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;
} }
virtual int CheckerBoard(const std::vector<int> &site){ virtual int CheckerBoard(const Coordinate &site){
int linear=0; int linear=0;
assert(site.size()==_ndimension); assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d]) if(_checker_dim_mask[d])
linear=linear+site[d]; linear=linear+site[d];
}
return (linear&0x1);
} }
return (linear&0x1);
}
// Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e)
// we need
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
if(dim != _checker_dim) return shift;
// Depending on the cb of site, we toggle source cb. int fulldim =_fdimensions[dim];
// for block #b, element #e = (b, e) shift = (shift+fulldim)%fulldim;
// we need
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
if(dim != _checker_dim) return shift;
int fulldim =_fdimensions[dim]; // Probably faster with table lookup;
shift = (shift+fulldim)%fulldim; // or by looping over x,y,z and multiply rather than computing checkerboard.
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
if ( (source_cb+ocb)&1 ) { if ( (source_cb+ocb)&1 ) {
return (shift)/2; return (shift)/2;
} else { } else {
return (shift+1)/2; return (shift+1)/2;
}
} }
virtual int CheckerBoardFromOindexTable (int Oindex) { }
return _checker_board[Oindex]; virtual int CheckerBoardFromOindexTable (int Oindex) {
} return _checker_board[Oindex];
virtual int CheckerBoardFromOindex (int Oindex) }
{ virtual int CheckerBoardFromOindex (int Oindex)
std::vector<int> ocoor; {
oCoorFromOindex(ocoor,Oindex); Coordinate ocoor;
return CheckerBoard(ocoor); oCoorFromOindex(ocoor,Oindex);
} return CheckerBoard(ocoor);
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){ }
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != _checker_dim) return shift; if(dim != _checker_dim) return shift;
int ocb=CheckerBoardFromOindex(osite); int ocb=CheckerBoardFromOindex(osite);
return CheckerBoardShiftForCB(source_cb,dim,shift,ocb); return CheckerBoardShiftForCB(source_cb,dim,shift,ocb);
} }
virtual int CheckerBoardDestination(int source_cb,int shift,int dim){ virtual int CheckerBoardDestination(int source_cb,int shift,int dim){
if ( _checker_dim_mask[dim] ) { if ( _checker_dim_mask[dim] ) {
// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims // If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
// does NOT cause a parity hop. // does NOT cause a parity hop.
int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim]; int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim];
if ( (shift+add) &0x1) { if ( (shift+add) &0x1) {
return 1-source_cb; return 1-source_cb;
} else {
return source_cb;
}
} else { } else {
return source_cb; return source_cb;
} }
}; } else {
return source_cb;
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{
int dims = base->_ndimension;
std::vector<int> checker_dim_mask(dims,1);
int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
};
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(base->_processors,*base)
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
} }
};
virtual ~GridRedBlackCartesian() = default; ////////////////////////////////////////////////////////////
#if 0 // Create Redblack from original grid; require full grid pointer ?
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Create redblack grid ;; deprecate these. Should not GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
// need direct creation of redblack without a full grid to base on {
//////////////////////////////////////////////////////////// int dims = base->_ndimension;
GridRedBlackCartesian(const GridBase *base, Coordinate checker_dim_mask(dims,1);
const std::vector<int> &dimensions, int checker_dim = 0;
const std::vector<int> &simd_layout, Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
const std::vector<int> &processor_grid, };
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(processor_grid,*base)
{
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Create redblack grid // Create redblack from original grid, with non-trivial checker dim mask
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base, GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions, const Coordinate &checker_dim_mask,
const std::vector<int> &simd_layout, int checker_dim
const std::vector<int> &processor_grid) : GridBase(processor_grid,*base) ) : GridBase(base->_processors,*base)
{ {
std::vector<int> checker_dim_mask(dimensions.size(),1); Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
int checker_dim = 0; }
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
#endif
void Init(const std::vector<int> &dimensions, virtual ~GridRedBlackCartesian() = default;
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid, void Init(const Coordinate &dimensions,
const std::vector<int> &checker_dim_mask, const Coordinate &simd_layout,
int checker_dim) const Coordinate &processor_grid,
{ const Coordinate &checker_dim_mask,
int checker_dim)
{
_isCheckerBoarded = true; _isCheckerBoarded = true;
_checker_dim = checker_dim; _checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1); assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size(); _ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension); assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension); assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension); assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension); _fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension); _gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension); _ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
_ostride.resize(_ndimension); _ostride.resize(_ndimension);
_istride.resize(_ndimension); _istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1; _fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask = checker_dim_mask; _checker_dim_mask = checker_dim_mask;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
_fdimensions[d] = dimensions[d]; _fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d]; _gdimensions[d] = _fdimensions[d];
@ -203,11 +172,11 @@ public:
_gsites = _gsites * _gdimensions[d]; _gsites = _gsites * _gdimensions[d];
if (d == _checker_dim) if (d == _checker_dim)
{ {
assert((_gdimensions[d] & 0x1) == 0); assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2; _gsites /= 2;
} }
_ldimensions[d] = _gdimensions[d] / _processors[d]; _ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d]; _lstart[d] = _processor_coor[d] * _ldimensions[d];
@ -222,42 +191,42 @@ public:
// all elements of a simd vector must have same checkerboard. // all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d // If Ls vectorised, this must still be the case; e.g. dwf rb5d
if (_simd_layout[d] > 1) if (_simd_layout[d] > 1)
{ {
if (checker_dim_mask[d]) if (checker_dim_mask[d])
{ {
assert((_rdimensions[d] & 0x1) == 0); assert((_rdimensions[d] & 0x1) == 0);
} }
} }
_osites *= _rdimensions[d]; _osites *= _rdimensions[d];
_isites *= _simd_layout[d]; _isites *= _simd_layout[d];
// Addressing support // Addressing support
if (d == 0) if (d == 0)
{ {
_ostride[d] = 1; _ostride[d] = 1;
_istride[d] = 1; _istride[d] = 1;
} }
else else
{ {
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1]; _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
} }
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// subplane information // subplane information
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension); _slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension); _slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension); _slice_nblock.resize(_ndimension);
int block = 1; int block = 1;
int nblock = 1; int nblock = 1;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d]; nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
nblock /= _rdimensions[d]; nblock /= _rdimensions[d];
_slice_block[d] = block; _slice_block[d] = block;
@ -266,55 +235,55 @@ public:
block = block * _rdimensions[d]; block = block * _rdimensions[d];
} }
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Create a checkerboard lookup table // Create a checkerboard lookup table
//////////////////////////////////////////////// ////////////////////////////////////////////////
int rvol = 1; int rvol = 1;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
rvol = rvol * _rdimensions[d]; rvol = rvol * _rdimensions[d];
} }
_checker_board.resize(rvol); _checker_board.resize(rvol);
for (int osite = 0; osite < _osites; osite++) for (int osite = 0; osite < _osites; osite++)
{ {
_checker_board[osite] = CheckerBoardFromOindex(osite); _checker_board[osite] = CheckerBoardFromOindex(osite);
} }
}; };
protected: protected:
virtual int oIndex(std::vector<int> &coor) virtual int oIndex(Coordinate &coor)
{ {
int idx = 0; int idx = 0;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
if (d == _checker_dim) if (d == _checker_dim)
{ {
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
} }
else else
{ {
idx += _ostride[d] * (coor[d] % _rdimensions[d]); idx += _ostride[d] * (coor[d] % _rdimensions[d]);
} }
} }
return idx; return idx;
}; };
virtual int iIndex(std::vector<int> &lcoor) virtual int iIndex(Coordinate &lcoor)
{ {
int idx = 0; int idx = 0;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
if (d == _checker_dim) if (d == _checker_dim)
{ {
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
} }
else else
{ {
idx += _istride[d] * (lcoor[d] / _rdimensions[d]); idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
} }
} }
return idx; return idx;
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,11 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_COMMUNICATOR_H #ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H #define GRID_COMMUNICATOR_H
#include <Grid/util/Coordinate.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h> #include <Grid/communicator/Communicator_base.h>

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,15 +23,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <fcntl.h> #include <fcntl.h>
#include <unistd.h> #include <unistd.h>
#include <limits.h> #include <limits.h>
#include <sys/mman.h> #include <sys/mman.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
@ -47,8 +47,8 @@ int CartesianCommunicator::Dimensions(void) { return
int CartesianCommunicator::IsBoss(void) { return _processor==0; }; int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; }; int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; }; int CartesianCommunicator::ThisRank(void) { return _processor; };
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; const Coordinate & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; }; const Coordinate & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; }; int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -72,5 +72,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);
} }
} NAMESPACE_END(Grid);

View File

@ -1,5 +1,5 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_COMMUNICATOR_BASE_H #ifndef GRID_COMMUNICATOR_BASE_H
#define GRID_COMMUNICATOR_BASE_H #define GRID_COMMUNICATOR_BASE_H
@ -34,7 +34,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
class CartesianCommunicator : public SharedMemory { class CartesianCommunicator : public SharedMemory {
@ -52,9 +52,9 @@ public:
// Communicator should know nothing of the physics grid, only processor grid. // Communicator should know nothing of the physics grid, only processor grid.
//////////////////////////////////////////// ////////////////////////////////////////////
int _Nprocessors; // How many in all int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes. Coordinate _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate Coordinate _processor_coor; // linear processor coordinate
unsigned long _ndimension; unsigned long _ndimension;
static Grid_MPI_Comm communicator_world; static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator; Grid_MPI_Comm communicator;
@ -69,34 +69,34 @@ public:
// Constructors to sub-divide a parent communicator // Constructors to sub-divide a parent communicator
// and default to comm world // and default to comm world
//////////////////////////////////////////////// ////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank); CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const std::vector<int> &pdimensions_in); CartesianCommunicator(const Coordinate &pdimensions_in);
virtual ~CartesianCommunicator(); virtual ~CartesianCommunicator();
private: private:
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Private initialise from an MPI communicator // Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private // Can use after an MPI_Comm_split, but hidden from user so private
//////////////////////////////////////////////// ////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base); void InitFromMPICommunicator(const Coordinate &processors, Grid_MPI_Comm communicator_base);
public:
public:
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls // Wraps MPI_Cart routines, or implements equivalent on other impls
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest); void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor); int RankFromProcessorCoor(Coordinate &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor); void ProcessorCoorFromRank(int rank,Coordinate &coor);
int Dimensions(void) ; int Dimensions(void) ;
int IsBoss(void) ; int IsBoss(void) ;
int BossRank(void) ; int BossRank(void) ;
int ThisRank(void) ; int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ; const Coordinate & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ; const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ; int ProcessorCount(void) ;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -197,11 +197,12 @@ public:
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes); void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
template<class obj> void Broadcast(int root,obj &data) template<class obj> void Broadcast(int root,obj &data)
{ {
Broadcast(root,(void *)&data,sizeof(data)); Broadcast(root,(void *)&data,sizeof(data));
}; }
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -23,12 +23,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
@ -44,10 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) || if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
assert(0); assert(0);
}
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
assert(0);
}
} }
// Never clean up as done once. // Never clean up as done once.
@ -69,14 +74,14 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0); assert(ierr==0);
} }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{ {
int rank; int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank); int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0); assert(ierr==0);
return rank; return rank;
} }
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{ {
coor.resize(_ndimension); coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]); int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
@ -86,7 +91,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world // Initialises from communicator_world
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{ {
MPI_Comm optimal_comm; MPI_Comm optimal_comm;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
@ -105,12 +110,12 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
////////////////////////////////// //////////////////////////////////
// Try to subdivide communicator // Try to subdivide communicator
////////////////////////////////// //////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{ {
_ndimension = processors.size(); assert(_ndimension>=1); _ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0); Coordinate parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1); Coordinate parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc... // Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension; int pad = _ndimension-parent_ndimension;
@ -133,9 +138,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
int Nchild = Nparent/childsize; int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent); assert (childsize * Nchild == Nparent);
std::vector<int> ccoor(_ndimension); // coor within subcommunicator Coordinate ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent Coordinate scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent Coordinate ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d]; ccoor[d] = parent_processor_coor[d] % processors[d];
@ -152,36 +157,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
MPI_Comm comm_split; MPI_Comm comm_split;
if ( Nchild > 1 ) { if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Split the communicator // Split the communicator
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@ -203,7 +178,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
// Take the right SHM buffers // Take the right SHM buffers
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split); SetCommunicator(comm_split);
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Free the temp communicator // Free the temp communicator
/////////////////////////////////////////////// ///////////////////////////////////////////////
@ -220,7 +195,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
} }
} }
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base) void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base)
{ {
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo // Creates communicator, and the communicator_halo
@ -237,7 +212,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc
_Nprocessors*=_processors[i]; _Nprocessors*=_processors[i];
} }
std::vector<int> periodic(_ndimension,1); Coordinate periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator); MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor); MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@ -474,7 +449,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
std::vector<int> row(_ndimension,1); Coordinate row(_ndimension,1);
assert(dim>=0 && dim<_ndimension); assert(dim>=0 && dim<_ndimension);
// Split the communicator // Split the communicator
@ -503,7 +478,6 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
MPI_Type_free(&object); MPI_Type_free(&object);
} }
NAMESPACE_END(Grid);
}

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,11 +23,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
@ -38,18 +38,18 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
{ {
GlobalSharedMemory::Init(communicator_world); GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate( GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES, GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages); GlobalSharedMemory::Hugepages);
} }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors) : CartesianCommunicator(processors)
{ {
srank=0; srank=0;
SetCommunicator(communicator_world); SetCommunicator(communicator_world);
} }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{ {
_processors = processors; _processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1); _ndimension = processors.size(); assert(_ndimension>=1);
@ -122,8 +122,8 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;} int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor; } void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
source =0; source =0;
@ -160,6 +160,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};
NAMESPACE_END(Grid);
}

View File

@ -28,10 +28,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
// static data // static data
int GlobalSharedMemory::HPEhypercube = 1;
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL; uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0; int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup; int GlobalSharedMemory::_ShmSetup;
@ -76,6 +77,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl; std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
assert(heap_bytes<heap_size); assert(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr; return ptr;
} }
void SharedMemory::ShmBufferFreeAll(void) { void SharedMemory::ShmBufferFreeAll(void) {
@ -84,9 +86,9 @@ void SharedMemory::ShmBufferFreeAll(void) {
} }
void *SharedMemory::ShmBufferSelf(void) void *SharedMemory::ShmBufferSelf(void)
{ {
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank]; return ShmCommBufs[ShmRank];
} }
NAMESPACE_END(Grid);
}

View File

@ -25,18 +25,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
// b) shared memory buffers container
// -- static globally shared; init once
// -- per instance set of buffers.
//
#pragma once #pragma once
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
@ -57,26 +45,32 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <numaif.h> #include <numaif.h>
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
typedef int CommsRequest_t; typedef int CommsRequest_t;
typedef int Grid_MPI_Comm; typedef int Grid_MPI_Comm;
#endif #endif
class GlobalSharedMemory { class GlobalSharedMemory {
private: private:
static const int MAXLOG2RANKSPERNODE = 16; static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation // Init once lock on the buffer allocation
static int _ShmSetup; static int _ShmSetup;
static int _ShmAlloc; static int _ShmAlloc;
static uint64_t _ShmAllocBytes; static uint64_t _ShmAllocBytes;
public: public:
///////////////////////////////////////
// HPE 8600 hypercube optimisation
///////////////////////////////////////
static int HPEhypercube;
static int ShmSetup(void) { return _ShmSetup; } static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; } static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; } static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
@ -102,14 +96,16 @@ class GlobalSharedMemory {
// Create an optimal reordered communicator that makes MPI_Cart_create get it right // Create an optimal reordered communicator that makes MPI_Cart_create get it right
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// Provide shared memory facilities off comm world // Provide shared memory facilities off comm world
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
}; };
@ -118,14 +114,14 @@ class GlobalSharedMemory {
////////////////////////////// //////////////////////////////
class SharedMemory class SharedMemory
{ {
private: private:
static const int MAXLOG2RANKSPERNODE = 16; static const int MAXLOG2RANKSPERNODE = 16;
size_t heap_top; size_t heap_top;
size_t heap_bytes; size_t heap_bytes;
size_t heap_size; size_t heap_size;
protected: protected:
Grid_MPI_Comm ShmComm; // for barriers Grid_MPI_Comm ShmComm; // for barriers
int ShmRank; int ShmRank;
@ -133,7 +129,7 @@ class SharedMemory
std::vector<void *> ShmCommBufs; std::vector<void *> ShmCommBufs;
std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks
public: public:
SharedMemory() {}; SharedMemory() {};
~SharedMemory(); ~SharedMemory();
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
@ -150,6 +146,7 @@ class SharedMemory
// Call on any instance // Call on any instance
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
void SharedMemoryTest(void); void SharedMemoryTest(void);
void *ShmBufferSelf(void); void *ShmBufferSelf(void);
void *ShmBuffer (int rank); void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p); void *ShmBufferTranslate(int rank,void * local_p);
@ -164,4 +161,5 @@ class SharedMemory
}; };
} NAMESPACE_END(Grid);

View File

@ -29,8 +29,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
namespace Grid { #ifdef GRID_NVCC
#include <cuda_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
@ -46,6 +50,11 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize); MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank // WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes // WorldNodes
@ -130,7 +139,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
} }
return log2size; return log2size;
} }
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions // Look and see if it looks like an HPE 8600 based on hostname conventions
@ -143,10 +152,10 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
gethostname(name,namelen); gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3) OptimalCommunicatorHypercube(processors,optimal_comm); if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm); else OptimalCommunicatorSharedMemory(processors,optimal_comm);
} }
void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Assert power of two shm_size. // Assert power of two shm_size.
@ -188,9 +197,9 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
} }
std::string hname(name); std::string hname(name);
std::cout << "hostname "<<hname<<std::endl; // std::cout << "hostname "<<hname<<std::endl;
std::cout << "R " << R << " I " << I << " N "<< N // std::cout << "R " << R << " I " << I << " N "<< N
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl; // << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition. // broadcast node 0's base coordinate for this partition.
@ -213,7 +222,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
std::vector<int> processor_coor(ndimension); std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension); std::vector<int> WorldDims = processors.toVector();
std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension); std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension); std::vector<int> HyperCoor(ndimension);
int dim = 0; int dim = 0;
@ -221,7 +231,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension; while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2; ShmDims[dim]*=2;
dim=(dim+1)%ndimension; dim=(dim+1)%ndimension;
} }
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
@ -240,7 +250,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
HyperCoor[d]=hcoor & msk; HyperCoor[d]=hcoor & msk;
HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
hcoor = hcoor >> bits; hcoor = hcoor >> bits;
} }
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Check processor counts match // Check processor counts match
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@ -269,7 +279,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0); assert(ierr==0);
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Assert power of two shm_size. // Assert power of two shm_size.
@ -282,9 +292,9 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int>
// in a maximally symmetrical way // in a maximally symmetrical way
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
std::vector<int> processor_coor(ndimension); Coordinate processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension); Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1); Coordinate NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension); Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
int dim = 0; int dim = 0;
for(int l2=0;l2<log2size;l2++){ for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension; while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
@ -330,7 +340,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int>
#ifdef GRID_MPI3_SHMGET #ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
@ -385,14 +395,101 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
} }
#endif #endif
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// TODO/FIXME : NOT ALL NVLINK BOARDS have full Peer to peer connectivity.
// The annoyance is that they have partial peer 2 peer. This occurs on the 8 GPU blades.
// e.g. DGX1, supermicro board,
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
cudaSetDevice(WorldShmRank);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
auto err = cudaMalloc(&ShmCommBuf, bytes);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
for(int r=0;r<WorldShmSize;r++){
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
r,
WorldShmComm);
assert(ierr==0);
}
///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if ( r!=WorldShmRank ) {
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
WorldShmCommBufs[r] = thisBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
#ifdef GRID_MPI3_SHMMMAP #ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -429,7 +526,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; // std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
} }
_ShmAlloc=1; _ShmAlloc=1;
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
@ -439,7 +536,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE #ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -486,7 +583,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm); MPI_Barrier(WorldShmComm);
@ -552,14 +649,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
} }
#endif #endif
#endif // End NVCC case for GPU device buffers
/////////////////////////////////////////////////////////////////////////
// Routines accessing shared memory should route through for GPU safety
/////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////// void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
// Global shared functionality finished {
// Now move to per communicator functionality #ifdef GRID_NVCC
//////////////////////////////////////////////////////// cudaMemset(dest,0,bytes);
#else
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{
#ifdef GRID_NVCC
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else
bcopy(src,dest,bytes);
#endif
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{ {
int rank, size; int rank, size;
@ -587,7 +701,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@ -600,6 +713,8 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r; std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
SharedMemoryTest();
} }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// On node barrier // On node barrier
@ -614,24 +729,26 @@ void SharedMemory::ShmBarrier(void)
void SharedMemory::SharedMemoryTest(void) void SharedMemory::SharedMemoryTest(void)
{ {
ShmBarrier(); ShmBarrier();
uint64_t check[3];
uint64_t magic = 0x5A5A5A;
if ( ShmRank == 0 ) { if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r]; check[0]=GlobalSharedMemory::WorldNode;
check[0] = GlobalSharedMemory::WorldNode; check[1]=r;
check[1] = r; check[2]=magic;
check[2] = 0x5A5A5A; GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(int r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r]; ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode); assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r); assert(check[1]==r);
assert(check[2]==0x5A5A5A); assert(check[2]==magic);
ShmBarrier();
} }
ShmBarrier();
} }
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)
@ -645,7 +762,6 @@ void *SharedMemory::ShmBuffer(int rank)
} }
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{ {
static int count =0;
int gpeer = ShmRanks[rank]; int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){ if (gpeer == MPI_UNDEFINED){
@ -664,4 +780,5 @@ SharedMemory::~SharedMemory()
} }
}; };
} NAMESPACE_END(Grid);

View File

@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@ -47,7 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
_ShmSetup=1; _ShmSetup=1;
} }
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
optimal_comm = WorldComm; optimal_comm = WorldComm;
} }
@ -84,10 +84,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAlloc=1; _ShmAlloc=1;
}; };
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{ {
assert(GlobalSharedMemory::ShmAlloc()==1); assert(GlobalSharedMemory::ShmAlloc()==1);
@ -125,4 +125,5 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
SharedMemory::~SharedMemory() SharedMemory::~SharedMemory()
{}; {};
} NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef _GRID_CSHIFT_H_ #ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_ #define _GRID_CSHIFT_H_

View File

@ -25,10 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef _GRID_CSHIFT_COMMON_H_ #pragma once
#define _GRID_CSHIFT_COMMON_H_
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
@ -36,20 +35,21 @@ namespace Grid {
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask = 0x3; cbmask = 0x3;
} }
int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so=plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0; int ent = 0;
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static Vector<std::pair<int,int> > table; table.resize(e1*e2);
int stride=rhs.Grid()->_slice_stride[dimension];
int stride=rhs._grid->_slice_stride[dimension]; auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@ -63,66 +63,68 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b); table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
} }
} }
} }
} }
parallel_for(int i=0;i<ent;i++){ thread_for(i,ent,{
buffer[table[i].first]=rhs._odata[table[i].second]; buffer[table[i].first]=rhs_v[table[i].second];
} });
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split // Gather for when there *is* need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) Gather_plane_extract(const Lattice<vobj> &rhs,
ExtractPointerArray<typename vobj::scalar_object> pointers,
int dimension,int plane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask = 0x3; cbmask = 0x3;
} }
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){ if ( cbmask ==0x3){
parallel_for_nest2(int n=0;n<e1;n++){ thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*n1; int o = n*n1;
int offset = b+n*e2; int offset = b+n*e2;
vobj temp =rhs._odata[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
} });
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
std::cout << " Dense packed buffer WARNING " <<std::endl; std::cout << " Dense packed buffer WARNING " <<std::endl;
parallel_for_nest2(int n=0;n<e1;n++){ thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o=n*n1; int o=n*n1;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
int offset = b+n*e2; int offset = b+n*e2;
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
vobj temp =rhs._odata[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
} }
} });
} }
} }
@ -131,17 +133,17 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent =0; int ent =0;
@ -150,8 +152,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b); table[ent++] = std::pair<int,int>(so+o+b,bo+b);
} }
} }
@ -160,57 +162,60 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int bo=0; int bo=0;
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++); table[ent++]=std::pair<int,int> (so+o+b,bo++);
} }
} }
} }
} }
parallel_for(int i=0;i<ent;i++){ auto rhs_v = rhs.View();
rhs._odata[table[i].first]=buffer[table[i].second]; thread_for(i,ent,{
} rhs_v[table[i].first]=buffer[table[i].second];
});
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split // Scatter for when there *is* need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<typename vobj::scalar_object> pointers,int dimension,int plane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){ auto rhs_v = rhs.View();
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
} }
} });
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View();
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
} }
} }
} }
@ -222,18 +227,18 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask) template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0; int ent=0;
@ -248,7 +253,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride+b; int o =n*stride+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o); table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
@ -256,32 +261,33 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
} }
} }
parallel_for(int i=0;i<ent;i++){ auto rhs_v = rhs.View();
lhs._odata[table[i].first]=rhs._odata[table[i].second]; auto lhs_v = lhs.View();
} thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
} }
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block [dimension]; int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0; int ent=0;
double t_tab,t_perm;
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@ -292,14 +298,16 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} }
parallel_for(int i=0;i<ent;i++){ auto rhs_v = rhs.View();
permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type); auto lhs_v = lhs.View();
} thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
@ -309,11 +317,9 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
double t_local;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3); Cshift_local(ret,rhs,dimension,shift,0x3);
} else { } else {
@ -324,7 +330,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
GridBase *grid = rhs._grid; GridBase *grid = rhs.Grid();
int fd = grid->_fdimensions[dimension]; int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension]; int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension]; int ld = grid->_ldimensions[dimension];
@ -335,18 +341,18 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
shift = (shift+fd)%fd; shift = (shift+fd)%fd;
// the permute type // the permute type
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
int permute_dim =grid->PermuteDim(dimension); int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension); int permute_type=grid->PermuteType(dimension);
int permute_type_dist; int permute_type_dist;
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
int o = 0; // int o = 0;
int bo = x * grid->_ostride[dimension]; int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
int sx = (x+sshift)%rd; int sx = (x+sshift)%rd;
// wrap is whether sshift > rd. // wrap is whether sshift > rd.
@ -387,5 +393,5 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
} }
} }
} NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,33 +24,33 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef _GRID_CSHIFT_MPI_H_ #ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_ #define _GRID_CSHIFT_MPI_H_
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs._grid); Lattice<vobj> ret(rhs.Grid());
int fd = rhs._grid->_fdimensions[dimension]; int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension. // Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd; shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
// the permute type // the permute type
int simd_layout = rhs._grid->_simd_layout[dimension]; int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ; int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim); int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
if ( !comm_dim ) { if ( !comm_dim ) {
@ -70,10 +70,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; // std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl; // std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3); Cshift_comms(ret,rhs,dimension,shift,0x3);
@ -88,8 +88,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
@ -107,25 +107,25 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs._grid; GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs._grid); Lattice<vobj> temp(rhs.Grid());
int fd = rhs._grid->_fdimensions[dimension]; int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs._grid->_processors[dimension]; int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension]; int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ; int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1); assert(simd_layout==1);
assert(comm_dim==1); assert(comm_dim==1);
assert(shift>=0); assert(shift>=0);
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size); commVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size); commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
@ -145,7 +145,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
int rank = grid->_processor; // int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
@ -165,7 +165,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
GridBase *grid=rhs._grid; GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
@ -193,21 +193,21 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// Simd direction uses an extract/merge pair // Simd direction uses an extract/merge pair
/////////////////////////////////////////////// ///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); // ExtractPointerArray<scalar_object> pointers(Nsimd); //
std::vector<scalar_object *> rpointers(Nsimd); // received pointers ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
/////////////////////////////////////////// ///////////////////////////////////////////
// Work out what to send where // Work out what to send where
/////////////////////////////////////////// ///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even; int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim // loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
@ -257,6 +257,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
} }
}
} }
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,17 +23,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef _GRID_CSHIFT_NONE_H_ #ifndef _GRID_CSHIFT_NONE_H_
#define _GRID_CSHIFT_NONE_H_ #define _GRID_CSHIFT_NONE_H_
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
Lattice<vobj> ret(rhs._grid); Lattice<vobj> ret(rhs.Grid());
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
Cshift_local(ret,rhs,dimension,shift); Cshift_local(ret,rhs,dimension,shift);
return ret; return ret;
} }
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,3 +1,4 @@
#ifndef __NVCC__
/* /*
__ _____ _____ _____ __ _____ _____ _____
__| | __| | | | JSON for Modern C++ __| | __| | | | JSON for Modern C++
@ -18918,3 +18919,4 @@ inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std
#endif #endif
#endif

View File

@ -25,9 +25,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_H #pragma once
#define GRID_LATTICE_H
#include <Grid/lattice/Lattice_base.h> #include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h>
#include <Grid/lattice/Lattice_arith.h>
#include <Grid/lattice/Lattice_trace.h>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h>
//#include <Grid/lattice/Lattice_where.h>
#include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#endif

View File

@ -27,7 +27,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_ET_H #ifndef GRID_LATTICE_ET_H
#define GRID_LATTICE_ET_H #define GRID_LATTICE_ET_H
@ -36,13 +36,13 @@ directory
#include <typeinfo> #include <typeinfo>
#include <vector> #include <vector>
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Predicated where support // Predicated where support
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template <class iobj, class vobj, class robj> template <class iobj, class vobj, class robj>
inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
const robj &iffalse) { const robj &iffalse) {
typename std::remove_const<vobj>::type ret; typename std::remove_const<vobj>::type ret;
@ -51,11 +51,10 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd(); const int Nsimd = vobj::vector_type::Nsimd();
const int words = sizeof(vobj) / sizeof(vector_type);
std::vector<Integer> mask(Nsimd); ExtractBuffer<Integer> mask(Nsimd);
std::vector<scalar_object> truevals(Nsimd); ExtractBuffer<scalar_object> truevals(Nsimd);
std::vector<scalar_object> falsevals(Nsimd); ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals); extract(iftrue, truevals);
extract(iffalse, falsevals); extract(iffalse, falsevals);
@ -69,158 +68,148 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
return ret; return ret;
} }
//////////////////////////////////////////// /////////////////////////////////////////////////////
// recursive evaluation of expressions; Could
// switch to generic approach with variadics, a la
// Antonin's Lat Sim but the repack to variadic with popped
// from tuple is hideous; C++14 introduces std::make_index_sequence for this
////////////////////////////////////////////
// leaf eval of lattice ; should enable if protect using traits
template <typename T>
using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T>
using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
//Specialization of getVectorType for lattices //Specialization of getVectorType for lattices
/////////////////////////////////////////////////////
template<typename T> template<typename T>
struct getVectorType<Lattice<T> >{ struct getVectorType<Lattice<T> >{
typedef typename Lattice<T>::vector_object type; typedef typename Lattice<T>::vector_object type;
}; };
template<class sobj> ////////////////////////////////////////////
inline sobj eval(const unsigned int ss, const sobj &arg) //-- recursive evaluation of expressions; --
// handle leaves of syntax tree
///////////////////////////////////////////////////
template<class sobj> accelerator_inline
sobj eval(const uint64_t ss, const sobj &arg)
{ {
return arg; return arg;
} }
template <class lobj>
inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) { template <class lobj> accelerator_inline
return arg._odata[ss]; const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
{
return arg[ss];
}
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
auto view = arg.View();
return view[ss];
} }
// handle nodes in syntax tree ///////////////////////////////////////////////////
template <typename Op, typename T1> // handle nodes in syntax tree- eval one operand
auto inline eval( ///////////////////////////////////////////////////
const unsigned int ss, template <typename Op, typename T1> accelerator_inline
const LatticeUnaryExpression<Op, T1> &expr) // eval one operand auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) { -> decltype(expr.op.func( eval(ss, expr.arg1)))
return expr.first.func(eval(ss, std::get<0>(expr.second))); {
return expr.op.func( eval(ss, expr.arg1) );
} }
///////////////////////
template <typename Op, typename T1, typename T2> // eval two operands
auto inline eval( ///////////////////////
const unsigned int ss, template <typename Op, typename T1, typename T2> accelerator_inline
const LatticeBinaryExpression<Op, T1, T2> &expr) // eval two operands auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)), -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
eval(ss, std::get<1>(expr.second)))) { {
return expr.first.func(eval(ss, std::get<0>(expr.second)), return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
eval(ss, std::get<1>(expr.second)));
} }
///////////////////////
template <typename Op, typename T1, typename T2, typename T3> // eval three operands
auto inline eval(const unsigned int ss, ///////////////////////
const LatticeTrinaryExpression<Op, T1, T2, T3> template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
&expr) // eval three operands auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)), -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
eval(ss, std::get<1>(expr.second)), {
eval(ss, std::get<2>(expr.second)))) { return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)));
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Obtain the grid from an expression, ensuring conformable. This must follow a // Obtain the grid from an expression, ensuring conformable. This must follow a
// tree recursion // tree recursion; must retain grid pointer in the LatticeView class which sucks
// Use a different method, and make it void *.
// Perhaps a conformable method.
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <class T1, template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> accelerator_inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
{ {
if (grid) { lat.Conformable(grid);
conformable(grid, lat._grid);
}
grid = lat._grid;
} }
template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid, accelerator_inline
const T1 &notlat) // non-lattice leaf void GridFromExpression(GridBase *&grid,const T1 &notlat) // non-lattice leaf
{} {}
template <typename Op, typename T1> template <typename Op, typename T1>
inline void GridFromExpression(GridBase *&grid, accelerator_inline
const LatticeUnaryExpression<Op, T1> &expr) { void GridFromExpression(GridBase *&grid,const LatticeUnaryExpression<Op, T1> &expr)
GridFromExpression(grid, std::get<0>(expr.second)); // recurse {
GridFromExpression(grid, expr.arg1); // recurse
} }
template <typename Op, typename T1, typename T2> template <typename Op, typename T1, typename T2>
inline void GridFromExpression( accelerator_inline
GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) { void GridFromExpression(GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr)
GridFromExpression(grid, std::get<0>(expr.second)); // recurse {
GridFromExpression(grid, std::get<1>(expr.second)); GridFromExpression(grid, expr.arg1); // recurse
GridFromExpression(grid, expr.arg2);
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
inline void GridFromExpression( accelerator_inline
GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) { void GridFromExpression(GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
GridFromExpression(grid, std::get<0>(expr.second)); // recurse {
GridFromExpression(grid, std::get<1>(expr.second)); GridFromExpression(grid, expr.arg1); // recurse
GridFromExpression(grid, std::get<2>(expr.second)); GridFromExpression(grid, expr.arg2); // recurse
GridFromExpression(grid, expr.arg3); // recurse
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Obtain the CB from an expression, ensuring conformable. This must follow a // Obtain the CB from an expression, ensuring conformable. This must follow a
// tree recursion // tree recursion
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <class T1, template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{ {
if ((cb == Odd) || (cb == Even)) { if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.checkerboard); assert(cb == lat.Checkerboard());
} }
cb = lat.checkerboard; cb = lat.Checkerboard();
// std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
} }
template <class T1, template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{ {
// std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
}
template <typename Op, typename T1>
inline void CBFromExpression(int &cb,
const LatticeUnaryExpression<Op, T1> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
// std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
} }
template <typename Op, typename T1, typename T2> template <typename Op, typename T1> inline
inline void CBFromExpression(int &cb, void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
const LatticeBinaryExpression<Op, T1, T2> &expr) { {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, std::get<1>(expr.second)); }
// std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, expr.arg2); // recurse AST
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
inline void CBFromExpression( inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) { {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, std::get<1>(expr.second)); CBFromExpression(cb, expr.arg2); // recurse AST
CBFromExpression(cb, std::get<2>(expr.second)); CBFromExpression(cb, expr.arg3); // recurse AST
// std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
} }
//////////////////////////////////////////// ////////////////////////////////////////////
// Unary operators and funcs // Unary operators and funcs
//////////////////////////////////////////// ////////////////////////////////////////////
#define GridUnopClass(name, ret) \ #define GridUnopClass(name, ret) \
template <class arg> \ template <class arg> \
struct name { \ struct name { \
static auto inline func(const arg a) -> decltype(ret) { return ret; } \ static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
}; };
GridUnopClass(UnarySub, -a); GridUnopClass(UnarySub, -a);
@ -250,19 +239,21 @@ GridUnopClass(UnaryExp, exp(a));
//////////////////////////////////////////// ////////////////////////////////////////////
// Binary operators // Binary operators
//////////////////////////////////////////// ////////////////////////////////////////////
#define GridBinOpClass(name, combination) \ #define GridBinOpClass(name, combination) \
template <class left, class right> \ template <class left, class right> \
struct name { \ struct name { \
static auto inline func(const left &lhs, const right &rhs) \ static auto accelerator_inline \
-> decltype(combination) const { \ func(const left &lhs, const right &rhs) \
return combination; \ -> decltype(combination) const \
} \ { \
} return combination; \
} \
};
GridBinOpClass(BinaryAdd, lhs + rhs); GridBinOpClass(BinaryAdd, lhs + rhs);
GridBinOpClass(BinarySub, lhs - rhs); GridBinOpClass(BinarySub, lhs - rhs);
GridBinOpClass(BinaryMul, lhs *rhs); GridBinOpClass(BinaryMul, lhs *rhs);
GridBinOpClass(BinaryDiv, lhs /rhs); GridBinOpClass(BinaryDiv, lhs /rhs);
GridBinOpClass(BinaryAnd, lhs &rhs); GridBinOpClass(BinaryAnd, lhs &rhs);
GridBinOpClass(BinaryOr, lhs | rhs); GridBinOpClass(BinaryOr, lhs | rhs);
GridBinOpClass(BinaryAndAnd, lhs &&rhs); GridBinOpClass(BinaryAndAnd, lhs &&rhs);
@ -271,92 +262,71 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Trinary conditional op // Trinary conditional op
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
#define GridTrinOpClass(name, combination) \ #define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \ template <class predicate, class left, class right> \
struct name { \ struct name { \
static auto inline func(const predicate &pred, const left &lhs, \ static auto accelerator_inline \
const right &rhs) -> decltype(combination) const { \ func(const predicate &pred, const left &lhs, const right &rhs) \
return combination; \ -> decltype(combination) const \
} \ { \
} return combination; \
} \
};
GridTrinOpClass( GridTrinOpClass(TrinaryWhere,
TrinaryWhere, (predicatedWhere<predicate,
(predicatedWhere<predicate, typename std::remove_reference<left>::type, typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs, typename std::remove_reference<right>::type>(pred, lhs,rhs)));
rhs)));
//////////////////////////////////////////// ////////////////////////////////////////////
// Operator syntactical glue // Operator syntactical glue
//////////////////////////////////////////// ////////////////////////////////////////////
#define GRID_UNOP(name) name<decltype(eval(0, arg))> #define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))> #define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) \ #define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \ #define GRID_DEF_UNOP(op, name) \
template <typename T1, \ template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
typename std::enable_if<is_lattice<T1>::value || \ inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
is_lattice_expr<T1>::value, \ { \
T1>::type * = nullptr> \ return LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
inline auto op(const T1 &arg) \
->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg))); \
} }
#define GRID_BINOP_LEFT(op, name) \ #define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \ template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value || \ typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
is_lattice_expr<T1>::value, \ inline auto op(const T1 &lhs, const T2 &rhs) \
T1>::type * = nullptr> \ ->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs)) \
inline auto op(const T1 &lhs, const T2 &rhs) \ { \
->decltype( \ return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs);\
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
} }
#define GRID_BINOP_RIGHT(op, name) \ #define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \ template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value && \ typename std::enable_if<!is_lattice<T1>::value&&!is_lattice_expr<T1>::value,T1>::type * = nullptr, \
!is_lattice_expr<T1>::value, \ typename std::enable_if< is_lattice<T2>::value|| is_lattice_expr<T2>::value,T2>::type * = nullptr> \
T1>::type * = nullptr, \ inline auto op(const T1 &lhs, const T2 &rhs) \
typename std::enable_if<is_lattice<T2>::value || \ ->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs)) \
is_lattice_expr<T2>::value, \ { \
T2>::type * = nullptr> \ return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs); \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
} }
#define GRID_DEF_BINOP(op, name) \ #define GRID_DEF_BINOP(op, name) \
GRID_BINOP_LEFT(op, name); \ GRID_BINOP_LEFT(op, name); \
GRID_BINOP_RIGHT(op, name); GRID_BINOP_RIGHT(op, name);
#define GRID_DEF_TRINOP(op, name) \ #define GRID_DEF_TRINOP(op, name) \
template <typename T1, typename T2, typename T3> \ template <typename T1, typename T2, typename T3> \
inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \ inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
->decltype( \ ->decltype(LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs)) \
LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \ { \
const T3 &>(std::make_pair( \ return LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs); \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) { \
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs))); \
} }
//////////////////////// ////////////////////////
// Operator definitions // Operator definitions
//////////////////////// ////////////////////////
GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot);
@ -400,29 +370,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Op, class T1> template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr) auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> { -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret( {
expr); Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2> template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
eval(0, std::get<1>(expr.second))))> { {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
eval(0, std::get<1>(expr.second))))>
ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2, class T3> template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, std::get<1>(expr.second)), eval(0, expr.arg2),
eval(0, std::get<2>(expr.second))))> { eval(0, expr.arg3)))>
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), {
eval(0, std::get<1>(expr.second)), Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, std::get<2>(expr.second))))> eval(0, expr.arg2),
ret(expr); eval(0, expr.arg3)))> ret(expr);
return ret; return ret;
} }
@ -433,34 +401,7 @@ auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
#undef GRID_DEF_UNOP #undef GRID_DEF_UNOP
#undef GRID_DEF_BINOP #undef GRID_DEF_BINOP
#undef GRID_DEF_TRINOP #undef GRID_DEF_TRINOP
}
#if 0 NAMESPACE_END(Grid);
using namespace Grid;
int main(int argc,char **argv){
Lattice<double> v1(16);
Lattice<double> v2(16);
Lattice<double> v3(16);
BinaryAdd<double,double> tmp;
LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &>
expr(std::make_pair(tmp,
std::forward_as_tuple(v1,v2)));
tmp.func(eval(0,v1),eval(0,v2));
auto var = v1+v2;
std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
v3=v1+v2;
v3=v1+v2+v1*v2;
};
void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3)
{
v3=v1+v2+v1*v2;
}
#endif
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,233 +23,235 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_ARITH_H #ifndef GRID_LATTICE_ARITH_H
#define GRID_LATTICE_ARITH_H #define GRID_LATTICE_ARITH_H
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs);
conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(lhs,ret);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
obj1 tmp;
mult(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
}
}
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,lhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
obj1 tmp;
mac(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
}
}
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(ret,lhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard;
conformable(lhs,ret);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs._odata[ss],&rhs);
vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs._odata[ss],&rhs);
#endif
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mult(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
mac(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
sub(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
}
}
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
#ifdef STREAMING_STORES
obj1 tmp;
add(&tmp,&lhs,&rhs._odata[ss]);
vstream(ret._odata[ss],tmp);
#else
add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
#endif
}
}
template<class sobj,class vobj> strong_inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
parallel_for(int ss=0;ss<x._grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = a*x._odata[ss]+y._odata[ss];
vstream(ret._odata[ss],tmp);
#else
ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
#endif
}
}
template<class sobj,class vobj> strong_inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
parallel_for(int ss=0;ss<x._grid->oSites();ss++){
#ifdef STREAMING_STORES
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vstream(ret._odata[ss],tmp);
#else
ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
#endif
}
}
template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
return axpby_norm_fast(ret,a,b,x,y);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
conformable(ret,rhs);
conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t = lhs_v(ss);
auto rhs_t = rhs_v(ss);
mult(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
} }
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
sub(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
add(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
sub(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
add(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
});
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
mult(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
sub(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
add(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
auto ret_v = ret.View();
auto x_v = x.View();
auto y_v = y.View();
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class sobj,class vobj> inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
auto ret_v = ret.View();
auto x_v = x.View();
auto y_v = y.View();
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
return axpby_norm_fast(ret,a,b,x,y);
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -27,349 +27,440 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_BASE_H #pragma once
#define GRID_LATTICE_BASE_H
#define STREAMING_STORES #define STREAMING_STORES
namespace Grid { NAMESPACE_BEGIN(Grid);
// TODO:
// mac,real,imag
// Functionality:
// -=,+=,*=,()
// add,+,sub,-,mult,mac,*
// adj,conjugate
// real,imag
// transpose,transposeIndex
// trace,traceIndex
// peekIndex
// innerProduct,outerProduct,
// localNorm2
// localInnerProduct
extern int GridCshiftPermuteMap[4][16]; extern int GridCshiftPermuteMap[4][16];
//////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Basic expressions used in Expression Template // Base class which can be used by traits to pick up behaviour
//////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
class LatticeBase {};
class LatticeBase /////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{ {
public: assert(lhs == rhs);
virtual ~LatticeBase(void) = default;
GridBase *_grid;
};
class LatticeExpressionBase {};
template <typename Op, typename T1>
class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
public:
LatticeUnaryExpression(const std::pair<Op,std::tuple<T1> > &arg): std::pair<Op,std::tuple<T1> >(arg) {};
};
template <typename Op, typename T1, typename T2>
class LatticeBinaryExpression : public std::pair<Op,std::tuple<T1,T2> > , public LatticeExpressionBase {
public:
LatticeBinaryExpression(const std::pair<Op,std::tuple<T1,T2> > &arg): std::pair<Op,std::tuple<T1,T2> >(arg) {};
};
template <typename Op, typename T1, typename T2, typename T3>
class LatticeTrinaryExpression :public std::pair<Op,std::tuple<T1,T2,T3> >, public LatticeExpressionBase {
public:
LatticeTrinaryExpression(const std::pair<Op,std::tuple<T1,T2,T3> > &arg): std::pair<Op,std::tuple<T1,T2,T3> >(arg) {};
};
void inline conformable(GridBase *lhs,GridBase *rhs)
{
assert((lhs == rhs) && " conformable check pointers mismatch ");
} }
template<class vobj> ////////////////////////////////////////////////////////////////////////////
class Lattice : public LatticeBase // Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{ {
public: public:
int checkerboard;
Vector<vobj> _odata;
// to pthread need a computable loop where loop induction is not required
int begin(void) { return 0;};
int end(void) { return _odata.size(); }
vobj & operator[](int i) { return _odata[i]; };
const vobj & operator[](int i) const { return _odata[i]; };
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public: public:
typedef typename vobj::scalar_type scalar_type; typedef typename ViewMap<_T1>::Type T1;
typedef typename vobj::vector_type vector_type; Op op;
typedef vobj vector_object; T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
/////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class Lattice : public LatticeAccelerator<vobj>
{
public:
GridBase *Grid(void) const { return this->_grid; }
///////////////////////////////////////////////////
// Member types
///////////////////////////////////////////////////
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef vobj vector_object;
private:
void dealloc(void)
{
alignedAllocator<vobj> alloc;
if( this->_odata_size ) {
alloc.deallocate(this->_odata,this->_odata_size);
this->_odata=nullptr;
this->_odata_size=0;
}
}
void resize(uint64_t size)
{
alignedAllocator<vobj> alloc;
if ( this->_odata_size != size ) {
dealloc();
}
this->_odata_size = size;
if ( size )
this->_odata = alloc.allocate(this->_odata_size);
else
this->_odata = nullptr;
}
public:
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
~Lattice() {
if ( this->_odata_size ) {
dealloc();
}
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Expression Template closure support // Expression Template closure support
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr) template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){ auto me = View();
#ifdef STREAMING_STORES accelerator_for(ss,me.size(),1,{
vobj tmp = eval(ss,expr); auto tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp); vstream(me[ss],tmp);
#else });
_odata[ss]=eval(ss,expr);
#endif
}
return *this; return *this;
} }
template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){ auto me = View();
#ifdef STREAMING_STORES accelerator_for(ss,me.size(),1,{
vobj tmp = eval(ss,expr); auto tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp); vstream(me[ss],tmp);
#else });
_odata[ss]=eval(ss,expr);
#endif
}
return *this; return *this;
} }
template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
auto me = View();
parallel_for(int ss=0;ss<_grid->oSites();ss++){ accelerator_for(ss,me.size(),1,{
#ifdef STREAMING_STORES auto tmp = eval(ss,expr);
//vobj tmp = eval(ss,expr); vstream(me[ss],tmp);
vstream(_odata[ss] ,eval(ss,expr)); });
#else
_odata[ss] = eval(ss,expr);
#endif
}
return *this; return *this;
} }
//GridFromExpression is tricky to do //GridFromExpression is tricky to do
template<class Op,class T1> template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr) { Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
_grid = nullptr; this->_grid = nullptr;
GridFromExpression(_grid,expr); GridFromExpression(this->_grid,expr);
assert(_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
_odata.resize(_grid->oSites()); resize(this->_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES *this = expr;
vobj tmp = eval(ss,expr); }
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
};
template<class Op,class T1, class T2> template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) { Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
_grid = nullptr; this->_grid = nullptr;
GridFromExpression(_grid,expr); GridFromExpression(this->_grid,expr);
assert(_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
_odata.resize(_grid->oSites()); resize(this->_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES *this = expr;
vobj tmp = eval(ss,expr); }
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
}
};
template<class Op,class T1, class T2, class T3> template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) { Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
_grid = nullptr; this->_grid = nullptr;
GridFromExpression(_grid,expr); GridFromExpression(this->_grid,expr);
assert(_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
_odata.resize(_grid->oSites()); resize(this->_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
vstream(_odata[ss] ,eval(ss,expr)); *this = expr;
} }
};
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View();
thread_for(ss,me.size(),{
me[ss] = r;
});
return *this;
}
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// Constructor requires "grid" passed. // Follow rule of five, with Constructor requires "grid" passed
// what about a default grid? // to user defined constructor
////////////////////////////////////////////////////////////////// ///////////////////////////////////////////
Lattice(GridBase *grid) : _odata(grid->oSites()) { // user defined constructor
_grid = grid; ///////////////////////////////////////////
// _odata.reserve(_grid->oSites()); Lattice(GridBase *grid) {
// _odata.resize(_grid->oSites()); this->_grid = grid;
// std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl; resize(this->_grid->oSites());
assert((((uint64_t)&_odata[0])&0xF) ==0); assert((((uint64_t)&this->_odata[0])&0xF) ==0);
checkerboard=0; this->checkerboard=0;
} }
Lattice(const Lattice& r){ // copy constructor // virtual ~Lattice(void) = default;
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
Lattice(Lattice&& r){ // move constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata=std::move(r._odata);
}
inline Lattice<vobj> & operator = (Lattice<vobj> && r)
{
_grid = r._grid;
checkerboard = r.checkerboard;
_odata =std::move(r._odata);
return *this;
}
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
virtual ~Lattice(void) = default;
void reset(GridBase* grid) { void reset(GridBase* grid) {
if (_grid != grid) { if (this->_grid != grid) {
_grid = grid; this->_grid = grid;
_odata.resize(grid->oSites()); this->_odata.resize(grid->oSites());
checkerboard = 0; this->checkerboard = 0;
} }
} }
///////////////////////////////////////////
// copy constructor
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ ///////////////////////////////////////////
parallel_for(int ss=0;ss<_grid->oSites();ss++){ Lattice(const Lattice& r){
this->_odata[ss]=r; // std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
} this->_grid = r.Grid();
resize(this->_grid->oSites());
*this = r;
}
///////////////////////////////////////////
// move constructor
///////////////////////////////////////////
Lattice(Lattice && r){
this->_grid = r.Grid();
this->_odata = r._odata;
this->_odata_size = r._odata_size;
this->checkerboard= r.Checkerboard();
r._odata = nullptr;
r._odata_size = 0;
}
///////////////////////////////////////////
// assignment template
///////////////////////////////////////////
template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View();
auto him= r.View();
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this; return *this;
} }
///////////////////////////////////////////
// Copy assignment
///////////////////////////////////////////
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View();
auto him= r.View();
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this;
}
///////////////////////////////////////////
// Move assignment possible if same type
///////////////////////////////////////////
inline Lattice<vobj> & operator = (Lattice<vobj> && r){
resize(0); // deletes if appropriate
this->_grid = r.Grid();
this->_odata = r._odata;
this->_odata_size = r._odata_size;
this->checkerboard= r.Checkerboard();
r._odata = nullptr;
r._odata_size = 0;
return *this;
}
/////////////////////////////////////////////////////////////////////////////
// *=,+=,-= operators inherit behvour from correspond */+/- operation // *=,+=,-= operators inherit behvour from correspond */+/- operation
template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) { /////////////////////////////////////////////////////////////////////////////
template<class T> inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r; *this = (*this)*r;
return *this; return *this;
} }
template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) { template<class T> inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r; *this = (*this)-r;
return *this; return *this;
} }
template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) { template<class T> inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r; *this = (*this)+r;
return *this; return *this;
} }
}; // class Lattice
friend inline void swap(Lattice &l, Lattice &r) {
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){ conformable(l,r);
std::vector<int> gcoor; LatticeAccelerator<vobj> tmp;
typedef typename vobj::scalar_object sobj; LatticeAccelerator<vobj> *lp = (LatticeAccelerator<vobj> *)&l;
sobj ss; LatticeAccelerator<vobj> *rp = (LatticeAccelerator<vobj> *)&r;
for(int g=0;g<o._grid->_gsites;g++){ tmp = *lp; *lp=*rp; *rp=tmp;
o._grid->GlobalIndexToGlobalCoor(g,gcoor);
peekSite(ss,o,gcoor);
stream<<"[";
for(int d=0;d<gcoor.size();d++){
stream<<gcoor[d];
if(d!=gcoor.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
return stream;
} }
}; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
sobj ss;
peekSite(ss,o,gcoor);
stream<<"[";
for(int d=0;d<gcoor.size();d++){
stream<<gcoor[d];
if(d!=gcoor.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
return stream;
} }
NAMESPACE_END(Grid);
#include "Lattice_conformable.h"
#define GRID_LATTICE_EXPRESSION_TEMPLATES
#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES
#include "Lattice_ET.h"
#else
#include "Lattice_overload.h"
#endif
#include "Lattice_arith.h"
#include "Lattice_trace.h"
#include "Lattice_transpose.h"
#include "Lattice_local.h"
#include "Lattice_reduction.h"
#include "Lattice_peekpoke.h"
#include "Lattice_reality.h"
#include "Lattice_comparison_utils.h"
#include "Lattice_comparison.h"
#include "Lattice_coordinate.h"
#include "Lattice_where.h"
#include "Lattice_rng.h"
#include "Lattice_unary.h"
#include "Lattice_transfer.h"
#endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,146 +24,184 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_COMPARISON_H #ifndef GRID_LATTICE_COMPARISON_H
#define GRID_LATTICE_COMPARISON_H #define GRID_LATTICE_COMPARISON_H
namespace Grid { NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// relational operators // relational operators
// //
// Support <,>,<=,>=,==,!= // Support <,>,<=,>=,==,!=
// //
//Query supporting bitwise &, |, ^, ! //Query supporting bitwise &, |, ^, !
//Query supporting logical &&, ||, //Query supporting logical &&, ||,
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////// typedef iScalar<vInteger> vPredicate ;
// compare lattice to lattice
////////////////////////////////////////////////////////////////////////// /*
template<class vfunctor,class lobj,class robj> template <class iobj, class vobj, class robj> accelerator_inline
inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
{ {
Lattice<vInteger> ret(rhs._grid); typename std::remove_const<vobj>::type ret;
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); typedef typename vobj::scalar_object scalar_object;
} typedef typename vobj::scalar_type scalar_type;
return ret; typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
} }
//////////////////////////////////////////////////////////////////////////
// compare lattice to scalar merge(ret, falsevals);
////////////////////////////////////////////////////////////////////////// return ret;
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vInteger> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare scalar to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vInteger> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=op(lhs._odata[ss],rhs);
}
return ret;
}
//////////////////////////////////////////////////////////////////////////
// Map to functors
//////////////////////////////////////////////////////////////////////////
// Less than
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
} }
*/
//////////////////////////////////////////////////////////////////////////
// compare lattice to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
});
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare lattice to scalar
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs);
});
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare scalar to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View();
auto ret_v = ret.View();
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]);
});
return ret;
}
//////////////////////////////////////////////////////////////////////////
// Map to functors
//////////////////////////////////////////////////////////////////////////
// Less than
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -26,10 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid { #pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////// /////////////////////////////////////////
// This implementation is a bit poor. // This implementation is a bit poor.
@ -44,42 +44,42 @@ namespace Grid {
// //
template<class lobj,class robj> class veq { template<class lobj,class robj> class veq {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) == (rhs); return (lhs) == (rhs);
} }
}; };
template<class lobj,class robj> class vne { template<class lobj,class robj> class vne {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) != (rhs); return (lhs) != (rhs);
} }
}; };
template<class lobj,class robj> class vlt { template<class lobj,class robj> class vlt {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) < (rhs); return (lhs) < (rhs);
} }
}; };
template<class lobj,class robj> class vle { template<class lobj,class robj> class vle {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) <= (rhs); return (lhs) <= (rhs);
} }
}; };
template<class lobj,class robj> class vgt { template<class lobj,class robj> class vgt {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) > (rhs); return (lhs) > (rhs);
} }
}; };
template<class lobj,class robj> class vge { template<class lobj,class robj> class vge {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) >= (rhs); return (lhs) >= (rhs);
} }
@ -88,42 +88,42 @@ namespace Grid {
// Generic list of functors // Generic list of functors
template<class lobj,class robj> class seq { template<class lobj,class robj> class seq {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) == (rhs); return (lhs) == (rhs);
} }
}; };
template<class lobj,class robj> class sne { template<class lobj,class robj> class sne {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) != (rhs); return (lhs) != (rhs);
} }
}; };
template<class lobj,class robj> class slt { template<class lobj,class robj> class slt {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) < (rhs); return (lhs) < (rhs);
} }
}; };
template<class lobj,class robj> class sle { template<class lobj,class robj> class sle {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) <= (rhs); return (lhs) <= (rhs);
} }
}; };
template<class lobj,class robj> class sgt { template<class lobj,class robj> class sgt {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) > (rhs); return (lhs) > (rhs);
} }
}; };
template<class lobj,class robj> class sge { template<class lobj,class robj> class sge {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) >= (rhs); return (lhs) >= (rhs);
} }
@ -133,12 +133,12 @@ namespace Grid {
// Integer and real get extra relational functions. // Integer and real get extra relational functions.
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<scalar> vrhs(vsimd::Nsimd()); ExtractBuffer<scalar> vrhs(vsimd::Nsimd());
std::vector<Integer> vpred(vsimd::Nsimd()); ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(lhs,vlhs); extract<vsimd,scalar>(lhs,vlhs);
extract<vsimd,scalar>(rhs,vrhs); extract<vsimd,scalar>(rhs,vrhs);
@ -150,11 +150,11 @@ namespace Grid {
} }
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs) accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd()); ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(lhs,vlhs); extract<vsimd,scalar>(lhs,vlhs);
for(int s=0;s<vsimd::Nsimd();s++){ for(int s=0;s<vsimd::Nsimd();s++){
@ -165,11 +165,11 @@ namespace Grid {
} }
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs) accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd()); ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(rhs,vrhs); extract<vsimd,scalar>(rhs,vrhs);
for(int s=0;s<vsimd::Nsimd();s++){ for(int s=0;s<vsimd::Nsimd();s++){
@ -181,30 +181,30 @@ namespace Grid {
#define DECLARE_RELATIONAL_EQ(op,functor) \ #define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\ accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \ accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \ accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd>\ template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \ accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \ { \
return lhs._internal op rhs; \ return lhs._internal op rhs; \
} \ } \
template<class vsimd>\ template<class vsimd>\
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \ accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \ { \
return lhs op rhs._internal; \ return lhs op rhs._internal; \
} \ } \
@ -212,7 +212,7 @@ namespace Grid {
#define DECLARE_RELATIONAL(op,functor) \ #define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \ DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\ template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\ accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \ { \
return lhs._internal op rhs._internal; \ return lhs._internal op rhs._internal; \
} }
@ -226,7 +226,7 @@ DECLARE_RELATIONAL(!=,sne);
#undef DECLARE_RELATIONAL #undef DECLARE_RELATIONAL
} NAMESPACE_END(Grid);
#endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,18 +23,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_CONFORMABLE_H #ifndef GRID_LATTICE_CONFORMABLE_H
#define GRID_LATTICE_CONFORMABLE_H #define GRID_LATTICE_CONFORMABLE_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs._grid == rhs._grid);
assert(lhs.checkerboard == rhs.checkerboard);
}
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs.Grid() == rhs.Grid());
assert(lhs.Checkerboard() == rhs.Checkerboard());
} }
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,34 +23,52 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_COORDINATE_H #pragma once
#define GRID_LATTICE_COORDINATE_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu) template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
{ {
typedef typename iobj::scalar_type scalar_type; typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type; typedef typename iobj::vector_type vector_type;
GridBase *grid = l._grid; GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
std::vector<int> gcoor; Coordinate gcoor;
std::vector<scalar_type> mergebuf(Nsimd); ExtractBuffer<scalar_type> mergebuf(Nsimd);
vector_type vI; vector_type vI;
for(int o=0;o<grid->oSites();o++){ auto l_v = l.View();
for(int i=0;i<grid->iSites();i++){ for(int o=0;o<grid->oSites();o++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); for(int i=0;i<grid->iSites();i++){
mergebuf[i]=(Integer)gcoor[mu]; grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
} mergebuf[i]=(Integer)gcoor[mu];
merge<vector_type,scalar_type>(vI,mergebuf); }
l._odata[o]=vI; merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI;
}
};
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
} }
}; }}
} }
#endif
NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_LOCALREDUCTION_H #ifndef GRID_LATTICE_LOCALREDUCTION_H
#define GRID_LATTICE_LOCALREDUCTION_H #define GRID_LATTICE_LOCALREDUCTION_H
@ -32,44 +32,56 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// localInner, localNorm, outerProduct // localInner, localNorm, outerProduct
/////////////////////////////////////////////// ///////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Non site, reduced locally reduced routines // Non site, reduced locally reduced routines
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// localNorm2, // localNorm2,
template<class vobj> template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs._grid); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ auto rhs_v = rhs.View();
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]); auto ret_v = ret.View();
} accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
return ret; coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
} });
return ret;
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
}
return ret;
}
} }
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
});
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(ll(),rr()))>
{
typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop
ret_v[ss]=outerProduct(lhs_v[ss],rhs_v[ss]);
});
return ret;
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -0,0 +1,202 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
NAMESPACE_BEGIN(Grid);
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto Y_v = Y.View();
auto R_v = R.View();
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_loop_collapse2( (int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto R_v = R.View();
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_loop_collapse2( (int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
thread_region {
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_loop_collapse2((int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
ComplexD z = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(z),imag(z));
}}
}});
thread_critical {
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}
NAMESPACE_END(Grid);

View File

@ -1,138 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_overload.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_OVERLOAD_H
#define GRID_LATTICE_OVERLOAD_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
Lattice<vobj> ret(r._grid);
parallel_for(int ss=0;ss<r._grid->oSites();ss++){
vstream(ret._odata[ss], -r._odata[ss]);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
//NB mult performs conformable check. Do not reapply here for performance.
/////////////////////////////////////////////////////////////////////////////////////
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
mult(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
add(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
sub(ret,lhs,rhs);
return ret;
}
// Scalar BinOp Lattice ;generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs*rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs+rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
}
return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]*rhs;
}
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]+rhs;
}
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]-rhs;
}
return ret;
}
}
#endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -25,8 +25,8 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_PEEK_H #ifndef GRID_LATTICE_PEEK_H
#define GRID_LATTICE_PEEK_H #define GRID_LATTICE_PEEK_H
@ -34,172 +34,184 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
// Peeking and poking around // Peeking and poking around
/////////////////////////////////////////////// ///////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Peek internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
ret.checkerboard=lhs.checkerboard;
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
}
return ret;
};
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
{
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
ret.checkerboard=lhs.checkerboard;
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
}
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Poke internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
{
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
}
}
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
{
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
}
}
//////////////////////////////////////////////////////
// Poke a scalar object into the SIMD array
//////////////////////////////////////////////////////
template<class vobj,class sobj>
void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
// Optional to broadcast from node 0.
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
grid->Broadcast(grid->BossRank(),s);
std::vector<sobj> buf(Nsimd);
// extract-modify-merge cycle is easiest way and this is not perf critical
if ( rank == grid->ThisRank() ) {
extract(l._odata[odx],buf);
buf[idx] = s;
merge(l._odata[odx],buf);
}
return;
};
////////////////////////////////////////////////////////// // FIXME accelerator_loop and accelerator_inline these
// Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////// // Peek internal indices of a Lattice object
template<class vobj,class sobj> ////////////////////////////////////////////////////////////////////////////////////////////////////
void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){ template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(vobj(),i))>
GridBase *grid=l._grid; {
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
typedef typename vobj::scalar_type scalar_type; ret.Checkerboard()=lhs.Checkerboard();
typedef typename vobj::vector_type vector_type; auto ret_v = ret.View();
auto lhs_v = lhs.View();
int Nsimd = grid->Nsimd(); thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
assert( l.checkerboard == l._grid->CheckerBoard(site)); });
return ret;
int rank,odx,idx; };
grid->GlobalCoorToRankIndex(rank,odx,idx,site); template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(vobj(),i,j))>
std::vector<sobj> buf(Nsimd); {
extract(l._odata[odx],buf); Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard();
s = buf[idx]; auto ret_v = ret.View();
auto lhs_v = lhs.View();
grid->Broadcast(rank,s); thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
return; });
}; return ret;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid = l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
idx= grid->iIndex(site);
odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
pt[w] = vp[idx+w*Nsimd];
}
return;
};
template<class vobj,class sobj>
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
idx= grid->iIndex(site);
odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w];
}
return;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Poke internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
});
} }
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
});
}
//////////////////////////////////////////////////////
// Poke a scalar object into the SIMD array
//////////////////////////////////////////////////////
template<class vobj,class sobj>
void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
// Optional to broadcast from node 0.
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
grid->Broadcast(grid->BossRank(),s);
// extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf);
buf[idx] = s;
merge(l_v[odx],buf);
}
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
extract(l_v[odx],buf);
s = buf[idx];
grid->Broadcast(rank,s);
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
GridBase *grid = l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
idx= grid->iIndex(site);
odx= grid->oIndex(site);
auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
pt[w] = vp[idx+w*Nsimd];
}
return;
};
template<class vobj,class sobj>
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
idx= grid->iIndex(site);
odx= grid->oIndex(site);
auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w];
}
return;
};
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -25,8 +25,8 @@ Author: neo <cossu@post.kek.jp>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_REALITY_H #ifndef GRID_LATTICE_REALITY_H
#define GRID_LATTICE_REALITY_H #define GRID_LATTICE_REALITY_H
@ -36,22 +36,28 @@ Author: neo <cossu@post.kek.jp>
// The choice of burying complex in the SIMD // The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome // is making the use of "real" and "imag" very cumbersome
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid); Lattice<vobj> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto lhs_v = lhs.View();
ret._odata[ss] = adj(lhs._odata[ss]); auto ret_v = ret.View();
} accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
return ret; coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
}; });
return ret;
};
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
});
return ret;
};
NAMESPACE_END(Grid);
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = conjugate(lhs._odata[ss]);
}
return ret;
};
}
#endif #endif

View File

@ -19,22 +19,76 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_REDUCTION_H #pragma once
#define GRID_LATTICE_REDUCTION_H
#include <Grid/Grid_Eigen_Dense.h> #include <Grid/Grid_Eigen_Dense.h>
namespace Grid {
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////// #ifdef GRID_NVCC
// Deterministic Reduction operations #include <Grid/lattice/Lattice_reduction_gpu.h>
//////////////////////////////////////////////////////////////////////////////////////////////////// #endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
// FIXME this should promote to double and accumulate
//////////////////////////////////////////////////////
template<class vobj>
inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#ifdef GRID_NVCC
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto arg_v = arg.View();
Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
auto nrm = innerProduct(arg,arg); ComplexD nrm = innerProduct(arg,arg);
return std::real(nrm); return real(nrm);
} }
// Double inner product // Double inner product
@ -43,32 +97,49 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
GridBase *grid = left._grid; ComplexD nrm;
const int pad = 8;
ComplexD inner;
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
}
// All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}
inner=0.0; GridBase *grid = left.Grid();
for(int i=0;i<grid->SumArraySize();i++){
inner = inner+sumarray[i*pad]; // Might make all code paths go this way.
} auto left_v = left.View();
right._grid->GlobalSum(inner); auto right_v=right.View();
return inner;
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
})
// This is in single precision and fails some tests
// Need a sumD that sums in double
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
grid->GlobalSum(nrm);
return nrm;
} }
///////////////////////// /////////////////////////
@ -86,8 +157,7 @@ axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj
template<class sobj,class vobj> strong_inline RealD template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
const int pad = 8; z.Checkerboard() = x.Checkerboard();
z.checkerboard = x.checkerboard;
conformable(z,x); conformable(z,x);
conformable(x,y); conformable(x,y);
@ -95,43 +165,57 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
RealD nrm; RealD nrm;
GridBase *grid = x._grid; GridBase *grid = x.Grid();
auto x_v=x.View();
auto y_v=y.View();
auto z_v=z.View();
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
Vector<RealD> sumarray(grid->SumArraySize()*pad); #ifdef GRID_NVCC
// GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ accelerator_for( ss, sites, nsimd,{
int nwork, mywork, myoff; auto tmp = a*x_v(ss)+b*y_v(ss);
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff); inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
// private to thread; sub summation });
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero; // Already promoted to double
for(int ss=myoff;ss<mywork+myoff; ss++){ nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
vobj tmp = a*x._odata[ss]+b*y._odata[ss]; #endif
vnrm = vnrm + innerProductD(tmp,tmp); grid->GlobalSum(nrm);
vstream(z._odata[ss],tmp);
}
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}
nrm = 0.0; // sum across threads; linear in thread count but fast
for(int i=0;i<grid->SumArraySize();i++){
nrm = nrm+sumarray[i*pad];
}
z._grid->GlobalSum(nrm);
return nrm; return nrm;
} }
template<class Op,class T1> template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
template<class Op,class T1,class T2> template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr) inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
@ -139,54 +223,14 @@ inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
template<class Op,class T1,class T2,class T3> template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), ->typename decltype(expr.op.func(eval(0,expr.arg1),
eval(0,std::get<1>(expr.second)), eval(0,expr.arg2),
eval(0,std::get<2>(expr.second)) eval(0,expr.arg3)
))::scalar_object ))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -199,7 +243,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// But easily avoided by using double precision fields // But easily avoided by using double precision fields
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid; GridBase *grid = Data.Grid();
assert(grid!=NULL); assert(grid!=NULL);
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
@ -212,13 +256,13 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first Vector<vobj> lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
lvSum[r]=zero; lvSum[r]=Zero();
} }
int e1= grid->_slice_nblock[orthogdim]; int e1= grid->_slice_nblock[orthogdim];
@ -227,20 +271,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
parallel_for(int r=0;r<rd;r++){ auto Data_v=Data.View();
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data._odata[ss]; lvSum[r]=lvSum[r]+Data_v[ss];
} }
} }
} });
// Sum across simd lanes in the plane, breaking out orthog dir. // Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd); Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){ for(int rt=0;rt<rd;rt++){
@ -265,7 +308,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
if ( pt == grid->_processor_coor[orthogdim] ) { if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt]; gsum=lsSum[lt];
} else { } else {
gsum=zero; gsum=Zero();
} }
grid->GlobalSum(gsum); grid->GlobalSum(gsum);
@ -292,9 +335,9 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
// std::cout << GridLogMessage << "Start prep" << std::endl; // std::cout << GridLogMessage << "Start prep" << std::endl;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid; GridBase *grid = lhs.Grid();
assert(grid!=NULL); assert(grid!=NULL);
conformable(grid,rhs._grid); conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
@ -307,14 +350,14 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
// std::cout << GridLogMessage << "Start alloc" << std::endl; // std::cout << GridLogMessage << "Start alloc" << std::endl;
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first Vector<vector_type> lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type>> extracted(Nsimd); // splitting the SIMD ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
// std::cout << GridLogMessage << "End alloc" << std::endl; // std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
lvSum[r]=zero; lvSum[r]=Zero();
} }
int e1= grid->_slice_nblock[orthogdim]; int e1= grid->_slice_nblock[orthogdim];
@ -323,23 +366,24 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
// std::cout << GridLogMessage << "End prep" << std::endl; // std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl; // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv; vector_type vv;
parallel_for(int r=0;r<rd;r++) auto l_v=lhs.View();
{ auto r_v=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss = so + n * stride + b; int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss])); vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
lvSum[r] = lvSum[r] + vv; lvSum[r] = lvSum[r] + vv;
} }
} }
} });
// std::cout << GridLogMessage << "End parallel inner product" << std::endl; // std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir. // Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd); Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){ for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp; iScalar<vector_type> temp;
@ -362,7 +406,7 @@ template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim) static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid; GridBase *grid = lhs.Grid();
int fd = result.size(); int fd = result.size();
int ld = lsSum.size(); int ld = lsSum.size();
// sum over nodes. // sum over nodes.
@ -388,9 +432,9 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid; GridBase *grid = lhs.Grid();
assert(grid!=NULL); assert(grid!=NULL);
conformable(grid,rhs._grid); conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
@ -402,34 +446,36 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first Vector<vector_type> lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
lvSum[r]=zero; lvSum[r]=Zero();
} }
int e1= grid->_slice_nblock[orthogdim]; int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
parallel_for(int r=0;r<rd;r++){ auto lhv=lhs.View();
auto rhv=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); vector_type vv = TensorRemove(innerProduct(lhv[ss],rhv[ss]));
lvSum[r]=lvSum[r]+vv; lvSum[r]=lvSum[r]+vv;
} }
} }
} });
// Sum across simd lanes in the plane, breaking out orthog dir. // Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd); Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){ for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp; iScalar<vector_type> temp;
@ -470,7 +516,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = rhs._grid->GlobalDimensions()[Orthog]; int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock); std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock); sn.resize(Nblock);
@ -492,7 +538,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
scalar_type zscale(scale); scalar_type zscale(scale);
GridBase *grid = X._grid; GridBase *grid = X.Grid();
int Nsimd =grid->Nsimd(); int Nsimd =grid->Nsimd();
int Nblock =grid->GlobalDimensions()[orthogdim]; int Nblock =grid->GlobalDimensions()[orthogdim];
@ -505,8 +551,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
int e2 =grid->_slice_block [orthogdim]; int e2 =grid->_slice_block [orthogdim];
int stride =grid->_slice_stride[orthogdim]; int stride =grid->_slice_stride[orthogdim];
std::vector<int> icoor; Coordinate icoor;
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@ -522,12 +567,15 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
parallel_for_nest2(int n=0;n<e1;n++){ auto Rv=R.View();
auto Xv=X.View();
auto Yv=Y.View();
thread_for_collapse(2, n, e1, {
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
R._odata[ss] = at*X._odata[ss]+Y._odata[ss]; Rv[ss] = at*Xv[ss]+Yv[ss];
} }
} });
} }
}; };
@ -559,18 +607,18 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog]; int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid; GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; // int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl = nh-1; // int nl = nh-1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -578,28 +626,31 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2) auto X_v=X.View();
for(int n=0;n<nblock;n++){ auto Y_v=Y.View();
for(int b=0;b<block;b++){ auto R_v=R.View();
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride]; s_x[i] = X_v[o+i*ostride];
} }
vobj dot; vobj dot;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
dot = Y[o+i*ostride]; dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){ for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i)); dot = dot + s_x[j]*(scale*aa(j,i));
} }
R[o+i*ostride]=dot; R_v[o+i*ostride]=dot;
} }
}} }});
} }
}; };
@ -610,17 +661,17 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog]; int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid; GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; // int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl=1; // int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -628,17 +679,19 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel auto R_v = R.View();
auto X_v = X.View();
thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){ thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){ for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride]; s_x[i] = X_v[o+i*ostride];
} }
vobj dot; vobj dot;
@ -647,11 +700,10 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
for(int j=1;j<Nblock;j++){ for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i)); dot = dot + s_x[j]*(scale*aa(j,i));
} }
R[o+i*ostride]=dot; R_v[o+i*ostride]=dot;
} }
}} }});
} }
}; };
@ -662,7 +714,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid; GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog]; int Nblock = FullGrid->GlobalDimensions()[Orthog];
@ -673,9 +725,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; // int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl = nh-1; // int nl = nh-1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -686,31 +738,33 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
#pragma omp parallel auto lhs_v=lhs.View();
auto rhs_v=rhs.View();
thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
#pragma omp for collapse(2) thread_for_collapse_in_region( 2, n,nblock,{
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){ for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
Left [i] = lhs[o+i*ostride]; Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs[o+i*ostride]; Right[i] = rhs_v[o+i*ostride];
} }
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){ for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]); auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp); auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp); auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}} }}
}} }});
#pragma omp critical thread_critical
{ {
mat += mat_thread; mat += mat_thread;
} }
@ -726,8 +780,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
return; return;
} }
} /*END NAMESPACE GRID*/ NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,226 @@
NAMESPACE_BEGIN(Grid);
#define WARP_SIZE 32
extern cudaDeviceProp *gpu_props;
__device__ unsigned int retirementCount = 0;
template <class Iterator>
unsigned int nextPow2(Iterator x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
cudaGetDevice(&device);
Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
}
// let the number of threads in a block be a multiple of 2, starting from warpSize
threads = warpSize;
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
}
template <class sobj, class Iterator>
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) {
Iterator blockSize = blockDim.x;
// cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp();
const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1);
sobj beta, temp;
memcpy((void *)&beta, (void *)&mySum, sizeof(sobj));
for (int i = VEC/2; i > 0; i>>=1) {
if (vid < i) {
memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj));
beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
}
__syncwarp();
}
__syncthreads();
if (threadIdx.x == 0) {
beta = Zero();
for (Iterator i = 0; i < blockSize; i += VEC) {
memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj));
beta += temp;
}
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
}
__syncthreads();
}
template <class vobj, class sobj, class Iterator>
__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
{
constexpr Iterator nsimd = vobj::Nsimd();
Iterator blockSize = blockDim.x;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *sdata = (sobj *)shmem_pointer;
// first level of reduction,
// each thread writes result in mySum
Iterator tid = threadIdx.x;
Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
Iterator gridSize = blockSize*2*gridDim.x;
sobj mySum = Zero();
while (i < n) {
Iterator lane = i % nsimd;
Iterator ss = i / nsimd;
auto tmp = extractLane(lane,g_idata[ss]);
sobj tmpD;
tmpD=tmp;
mySum +=tmpD;
if (i + blockSize < n) {
lane = (i+blockSize) % nsimd;
ss = (i+blockSize) / nsimd;
tmp = extractLane(lane,g_idata[ss]);
tmpD = tmp;
mySum += tmpD;
}
i += gridSize;
}
// copy mySum to shared memory and perform
// reduction for all threads in this block
reduceBlock(sdata, mySum, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class vobj, class sobj,class Iterator>
__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
Iterator blockSize = blockDim.x;
// perform reduction for this block and
// write result to global memory buffer
reduceBlocks(lat, buffer, n);
if (gridDim.x > 1) {
const Iterator tid = threadIdx.x;
__shared__ bool amLast;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
// true if this block is the last block to be done
amLast = (ticket == gridDim.x-1);
}
// each thread must read the correct value of amLast
__syncthreads();
if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1]
Iterator i = tid;
sobj mySum = Zero();
while (i < gridDim.x) {
mySum += buffer[i];
i += blockSize;
}
reduceBlock(smem, mySum, tid);
if (tid==0) {
buffer[0] = smem[0];
// reset count variable
retirementCount = 0;
}
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
auto result = buffer_v[0];
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,8 +24,8 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_RNG_H #ifndef GRID_LATTICE_RNG_H
#define GRID_LATTICE_RNG_H #define GRID_LATTICE_RNG_H
@ -41,282 +41,289 @@
#undef RNG_FAST_DISCARD #undef RNG_FAST_DISCARD
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid // Allow the RNG state to be less dense than the fine grid
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
inline int RNGfillable(GridBase *coarse,GridBase *fine) inline int RNGfillable(GridBase *coarse,GridBase *fine)
{ {
int rngdims = coarse->_ndimension; int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; int lowerdims = fine->_ndimension - coarse->_ndimension;
assert(lowerdims >= 0); assert(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){ for(int d=0;d<lowerdims;d++){
assert(fine->_simd_layout[d]==1); assert(fine->_simd_layout[d]==1);
assert(fine->_processors[d]==1); assert(fine->_processors[d]==1);
}
int multiplicity=1;
for(int d=0;d<lowerdims;d++){
multiplicity=multiplicity*fine->_rdimensions[d];
}
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){
int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
return multiplicity;
} }
int multiplicity=1;
for(int d=0;d<lowerdims;d++){
multiplicity=multiplicity*fine->_rdimensions[d];
}
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){
int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
return multiplicity;
}
// merge of April 11 2017 // merge of April 11 2017
// this function is necessary for the LS vectorised field // this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine) inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{ {
int rngdims = coarse->_ndimension; int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0); int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors // assumes that the higher dimensions are not using more processors
// all further divisions are local // all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1); for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]); for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites // then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same // check that the total number of sims agree, meanse the iSites are the same
assert(fine->Nsimd() == coarse->Nsimd()); assert(fine->Nsimd() == coarse->Nsimd());
// check that the two grids divide cleanly // check that the two grids divide cleanly
assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() ); assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
return fine->lSites() / coarse->lSites(); return fine->lSites() / coarse->lSites();
} }
// real scalars are one component // real scalars are one component
template<class scalar,class distribution,class generator> template<class scalar,class distribution,class generator>
void fillScalar(scalar &s,distribution &dist,generator & gen) void fillScalar(scalar &s,distribution &dist,generator & gen)
{ {
s=dist(gen); s=dist(gen);
} }
template<class distribution,class generator> template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen) void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{ {
s=ComplexF(dist(gen),dist(gen)); // s=ComplexF(dist(gen),dist(gen));
} s.real(dist(gen));
template<class distribution,class generator> s.imag(dist(gen));
void fillScalar(ComplexD &s,distribution &dist,generator &gen) }
{ template<class distribution,class generator>
s=ComplexD(dist(gen),dist(gen)); void fillScalar(ComplexD &s,distribution &dist,generator &gen)
} {
// s=ComplexD(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
}
class GridRNGbase { class GridRNGbase {
public: public:
// One generator per site. // One generator per site.
// Uniform and Gaussian distributions from these generators. // Uniform and Gaussian distributions from these generators.
#ifdef RNG_RANLUX #ifdef RNG_RANLUX
typedef std::ranlux48 RngEngine; typedef std::ranlux48 RngEngine;
typedef uint64_t RngStateType; typedef uint64_t RngStateType;
static const int RngStateCount = 15; static const int RngStateCount = 15;
#endif #endif
#ifdef RNG_MT19937 #ifdef RNG_MT19937
typedef std::mt19937 RngEngine; typedef std::mt19937 RngEngine;
typedef uint32_t RngStateType; typedef uint32_t RngStateType;
static const int RngStateCount = std::mt19937::state_size; static const int RngStateCount = std::mt19937::state_size;
#endif #endif
#ifdef RNG_SITMO #ifdef RNG_SITMO
typedef sitmo::prng_engine RngEngine; typedef sitmo::prng_engine RngEngine;
typedef uint64_t RngStateType; typedef uint64_t RngStateType;
static const int RngStateCount = 13; static const int RngStateCount = 13;
#endif #endif
std::vector<RngEngine> _generators; std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform; std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian; std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli; std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid; std::vector<std::uniform_int_distribution<uint32_t> > _uid;
/////////////////////// ///////////////////////
// support for parallel init // support for parallel init
/////////////////////// ///////////////////////
#ifdef RNG_FAST_DISCARD #ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site) static void Skip(RngEngine &eng,uint64_t site)
{ {
///////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites // Skip by 2^40 elements between successive lattice sites
// This goes by 10^12. // This goes by 10^12.
// Consider quenched updating; likely never exceeding rate of 1000 sweeps // Consider quenched updating; likely never exceeding rate of 1000 sweeps
// per second on any machine. This gives us of order 10^9 seconds, or 100 years // per second on any machine. This gives us of order 10^9 seconds, or 100 years
// skip ahead. // skip ahead.
// For HMC unlikely to go at faster than a solve per second, and // For HMC unlikely to go at faster than a solve per second, and
// tens of seconds per trajectory so this is clean in all reasonable cases, // tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude. // and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary // We could hack Sitmo to skip in the higher order words of state if necessary
// //
// Replace with 2^30 ; avoid problem on large volumes // Replace with 2^30 ; avoid problem on large volumes
// //
///////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30; const int shift = 30;
uint64_t skip = site; ////////////////////////////////////////////////////////////////////
// Weird compiler bug in Intel 2018.1 under O3 was generating 32bit and not 64 bit left shift.
////////////////////////////////////////////////////////////////////
volatile uint64_t skip = site;
skip = skip<<shift; skip = skip<<shift;
assert((skip >> shift)==site); // check for overflow assert((skip >> shift)==site); // check for overflow
eng.discard(skip); eng.discard(skip);
// std::cout << " Engine " <<site << " state " <<eng<<std::endl; // std::cout << " Engine " <<site << " state " <<eng<<std::endl;
} }
#endif #endif
static RngEngine Reseed(RngEngine &eng) static RngEngine Reseed(RngEngine &eng)
{ {
std::vector<uint32_t> newseed; std::vector<uint32_t> newseed;
std::uniform_int_distribution<uint32_t> uid; std::uniform_int_distribution<uint32_t> uid;
return Reseed(eng,newseed,uid); return Reseed(eng,newseed,uid);
} }
static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed, static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
std::uniform_int_distribution<uint32_t> &uid) std::uniform_int_distribution<uint32_t> &uid)
{ {
const int reseeds=4; const int reseeds=4;
newseed.resize(reseeds); newseed.resize(reseeds);
for(int i=0;i<reseeds;i++){ for(int i=0;i<reseeds;i++){
newseed[i] = uid(eng); newseed[i] = uid(eng);
} }
std::seed_seq sseq(newseed.begin(),newseed.end()); std::seed_seq sseq(newseed.begin(),newseed.end());
return RngEngine(sseq); return RngEngine(sseq);
} }
void GetState(std::vector<RngStateType> & saved,RngEngine &eng) { void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
saved.resize(RngStateCount); saved.resize(RngStateCount);
std::stringstream ss; std::stringstream ss;
ss<<eng; ss<<eng;
ss.seekg(0,ss.beg); ss.seekg(0,ss.beg);
for(int i=0;i<RngStateCount;i++){ for(int i=0;i<RngStateCount;i++){
ss>>saved[i]; ss>>saved[i];
}
} }
void GetState(std::vector<RngStateType> & saved,int gen) { }
GetState(saved,_generators[gen]); void GetState(std::vector<RngStateType> & saved,int gen) {
GetState(saved,_generators[gen]);
}
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
assert(saved.size()==RngStateCount);
std::stringstream ss;
for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" ";
} }
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){ ss.seekg(0,ss.beg);
assert(saved.size()==RngStateCount); ss>>eng;
std::stringstream ss; }
for(int i=0;i<RngStateCount;i++){ void SetState(std::vector<RngStateType> & saved,int gen){
ss<< saved[i]<<" "; SetState(saved,_generators[gen]);
} }
ss.seekg(0,ss.beg); void SetEngine(RngEngine &Eng, int gen){
ss>>eng; _generators[gen]=Eng;
} }
void SetState(std::vector<RngStateType> & saved,int gen){ void GetEngine(RngEngine &Eng, int gen){
SetState(saved,_generators[gen]); Eng=_generators[gen];
} }
void SetEngine(RngEngine &Eng, int gen){ template<class source> void Seed(source &src, int gen)
_generators[gen]=Eng; {
} _generators[gen] = RngEngine(src);
void GetEngine(RngEngine &Eng, int gen){ }
Eng=_generators[gen]; };
}
template<class source> void Seed(source &src, int gen)
{
_generators[gen] = RngEngine(src);
}
};
class GridSerialRNG : public GridRNGbase { class GridSerialRNG : public GridRNGbase {
public: public:
GridSerialRNG() : GridRNGbase() { GridSerialRNG() : GridRNGbase() {
_generators.resize(1); _generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) ); _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() ); _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
} }
template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){ template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
typedef typename sobj::scalar_type scalar_type; typedef typename sobj::scalar_type scalar_type;
int words = sizeof(sobj)/sizeof(scalar_type); int words = sizeof(sobj)/sizeof(scalar_type);
scalar_type *buf = (scalar_type *) & l; scalar_type *buf = (scalar_type *) & l;
dist[0].reset(); dist[0].reset();
for(int idx=0;idx<words;idx++){ for(int idx=0;idx<words;idx++){
fillScalar(buf[idx],dist[0],_generators[0]); fillScalar(buf[idx],dist[0],_generators[0]);
} }
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}; }
template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){ template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
dist[0].reset(); dist[0].reset();
fillScalar(l,dist[0],_generators[0]); fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(ComplexD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// vector fill
template <class distribution> inline void fill(vComplexF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
} }
template <class distribution> inline void fill(ComplexD &l,std::vector<distribution> &dist){ CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
dist[0].reset(); }
fillScalar(l,dist[0],_generators[0]); template <class distribution> inline void fill(vComplexD &l,std::vector<distribution> &dist){
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
} }
template <class distribution> inline void fill(RealF &l,std::vector<distribution> &dist){ CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
dist[0].reset(); }
fillScalar(l,dist[0],_generators[0]); template <class distribution> inline void fill(vRealF &l,std::vector<distribution> &dist){
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<vRealF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
} }
template <class distribution> inline void fill(RealD &l,std::vector<distribution> &dist){ CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
dist[0].reset(); }
fillScalar(l,dist[0],_generators[0]); template <class distribution> inline void fill(vRealD &l,std::vector<distribution> &dist){
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); RealD *pointer=(RealD *)&l;
} dist[0].reset();
// vector fill for(int i=0;i<vRealD::Nsimd();i++){
template <class distribution> inline void fill(vComplexF &l,std::vector<distribution> &dist){ fillScalar(pointer[i],dist[0],_generators[0]);
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vComplexD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<vRealF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<vRealD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
} }
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
void SeedFixedIntegers(const std::vector<int> &seeds){ void SeedFixedIntegers(const std::vector<int> &seeds){
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq src(seeds.begin(),seeds.end()); std::seed_seq src(seeds.begin(),seeds.end());
Seed(src,0); Seed(src,0);
} }
void SeedUniqueString(const std::string &s){ void SeedUniqueString(const std::string &s){
std::vector<int> seeds; std::vector<int> seeds;
@ -330,65 +337,67 @@ namespace Grid {
std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl; std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
SeedFixedIntegers(seeds); SeedFixedIntegers(seeds);
} }
}; };
class GridParallelRNG : public GridRNGbase { class GridParallelRNG : public GridRNGbase {
private:
double _time_counter;
GridBase *_grid;
unsigned int _vol;
double _time_counter; public:
GridBase *Grid(void) const { return _grid; }
int generator_idx(int os,int is) {
return is*_grid->oSites()+os;
}
public: GridParallelRNG(GridBase *grid) : GridRNGbase() {
GridBase *_grid; _grid = grid;
unsigned int _vol; _vol =_grid->iSites()*_grid->oSites();
int generator_idx(int os,int is) { _generators.resize(_vol);
return is*_grid->oSites()+os; _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
} _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
GridParallelRNG(GridBase *grid) : GridRNGbase() { template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
_grid = grid;
_vol =_grid->iSites()*_grid->oSites();
_generators.resize(_vol); typedef typename vobj::scalar_object scalar_object;
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); typedef typename vobj::scalar_type scalar_type;
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) ); typedef typename vobj::vector_type vector_type;
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){ double inner_time_counter = usecond();
typedef typename vobj::scalar_object scalar_object; int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
typedef typename vobj::scalar_type scalar_type; int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too
typedef typename vobj::vector_type vector_type; int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
double inner_time_counter = usecond(); auto l_v = l.View();
thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid int sm = multiplicity * ss + m; // Maps the generator site to the fine site
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l._grid too
int osites = _grid->oSites(); // guaranteed to be <= l._grid->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
parallel_for(int ss=0;ss<osites;ss++){ for (int si = 0; si < Nsimd; si++) {
std::vector<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
for (int si = 0; si < Nsimd; si++) {
int gdx = generator_idx(ss, si); // index of generator state int gdx = generator_idx(ss, si); // index of generator state
scalar_type *pointer = (scalar_type *)&buf[si]; scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset(); dist[gdx].reset();
for (int idx = 0; idx < words; idx++) for (int idx = 0; idx < words; idx++)
fillScalar(pointer[idx], dist[gdx], _generators[gdx]); fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
} }
// merge into SIMD lanes, FIXME suboptimal implementation // merge into SIMD lanes, FIXME suboptimal implementation
merge(l._odata[sm], buf); merge(l_v[sm], buf);
}
} }
});
// });
_time_counter += usecond()- inner_time_counter; _time_counter += usecond()- inner_time_counter;
}; }
void SeedUniqueString(const std::string &s){ void SeedUniqueString(const std::string &s){
std::vector<int> seeds; std::vector<int> seeds;
@ -398,119 +407,119 @@ namespace Grid {
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds); SeedFixedIntegers(seeds);
} }
void SeedFixedIntegers(const std::vector<int> &seeds){ void SeedFixedIntegers(const std::vector<int> &seeds){
// Everyone generates the same seed_seq based on input seeds // Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq source(seeds.begin(),seeds.end()); std::seed_seq source(seeds.begin(),seeds.end());
RngEngine master_engine(source); RngEngine master_engine(source);
#ifdef RNG_FAST_DISCARD #ifdef RNG_FAST_DISCARD
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Skip ahead through a single stream. // Skip ahead through a single stream.
// Applicable to SITMO and other has based/crypto RNGs // Applicable to SITMO and other has based/crypto RNGs
// Should be applicable to Mersenne Twister, but the C++11 // Should be applicable to Mersenne Twister, but the C++11
// MT implementation does not implement fast discard even though // MT implementation does not implement fast discard even though
// in principle this is possible // in principle this is possible
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Everybody loops over global volume.
parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it? // Where is it?
int rank,o_idx,i_idx; int rank;
std::vector<int> gcoor; int o_idx;
int i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor); _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it // If this is one of mine we take it
if( rank == _grid->ThisRank() ){ if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx); int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine; _generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
} }
});
}
#else #else
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient // Machine and thread decomposition dependent seeding is efficient
// and maximally parallel; but NOT reproducible from machine to machine. // and maximally parallel; but NOT reproducible from machine to machine.
// Not ideal, but fastest way to reseed all nodes. // Not ideal, but fastest way to reseed all nodes.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
{ {
// Obtain one Reseed per processor // Obtain one Reseed per processor
int Nproc = _grid->ProcessorCount(); int Nproc = _grid->ProcessorCount();
std::vector<RngEngine> seeders(Nproc); std::vector<RngEngine> seeders(Nproc);
int me= _grid->ThisRank(); int me= _grid->ThisRank();
for(int p=0;p<Nproc;p++){ for(int p=0;p<Nproc;p++){
seeders[p] = Reseed(master_engine); seeders[p] = Reseed(master_engine);
} }
master_engine = seeders[me]; master_engine = seeders[me];
}
{
// Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads();
std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine);
} }
{ thread_for( t, Nthread, {
// Obtain one reseeded generator per thread // set up one per local site in threaded fashion
int Nthread = GridThread::GetThreads(); std::vector<uint32_t> newseeds;
std::vector<RngEngine> seeders(Nthread); std::uniform_int_distribution<uint32_t> uid;
for(int t=0;t<Nthread;t++){ for(int l=0;l<_grid->lSites();l++) {
seeders[t] = Reseed(master_engine); if ( (l%Nthread)==t ) {
} _generators[l] = Reseed(seeders[t],newseeds,uid);
parallel_for(int t=0;t<Nthread;t++) {
// set up one per local site in threaded fashion
std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid;
for(int l=0;l<_grid->lSites();l++) {
if ( (l%Nthread)==t ) {
_generators[l] = Reseed(seeders[t],newseeds,uid);
}
} }
} }
} });
}
#endif #endif
}
void Report(){
std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
}
////////////////////////////////////////////////////////////////////////
// Support for rigorous test of RNG's
// Return uniform random uint32_t from requested site generator
////////////////////////////////////////////////////////////////////////
uint32_t GlobalU01(int gsite){
uint32_t the_number;
// who
int rank,o_idx,i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// draw
int l_idx=generator_idx(o_idx,i_idx);
if( rank == _grid->ThisRank() ){
the_number = _uid[l_idx](_generators[l_idx]);
} }
void Report(){
std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
}
////////////////////////////////////////////////////////////////////////
// Support for rigorous test of RNG's
// Return uniform random uint32_t from requested site generator
////////////////////////////////////////////////////////////////////////
uint32_t GlobalU01(int gsite){
uint32_t the_number;
// who
std::vector<int> gcoor;
int rank,o_idx,i_idx;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// draw
int l_idx=generator_idx(o_idx,i_idx);
if( rank == _grid->ThisRank() ){
the_number = _uid[l_idx](_generators[l_idx]);
}
// share & return // share & return
_grid->Broadcast(rank,(void *)&the_number,sizeof(the_number)); _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
return the_number; return the_number;
} }
}; };
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); } template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); } template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);} template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); } template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,8 +23,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_TRACE_H #ifndef GRID_LATTICE_TRACE_H
#define GRID_LATTICE_TRACE_H #define GRID_LATTICE_TRACE_H
@ -32,36 +32,38 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// Tracing, transposing, peeking, poking // Tracing, transposing, peeking, poking
/////////////////////////////////////////////// ///////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace // Trace
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
-> Lattice<decltype(trace(lhs._odata[0]))> {
{ Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid); auto ret_v = ret.View();
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto lhs_v = lhs.View();
ret._odata[ss] = trace(lhs._odata[ss]); accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
} coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
return ret; });
}; return ret;
};
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation // Trace Index level dependent operation
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{ {
Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid); Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]); auto lhs_v = lhs.View();
} accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
return ret; coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
}; });
return ret;
};
NAMESPACE_END(Grid);
}
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_TRANSPOSE_H #ifndef GRID_LATTICE_TRANSPOSE_H
#define GRID_LATTICE_TRANSPOSE_H #define GRID_LATTICE_TRANSPOSE_H
@ -33,31 +33,36 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// Transpose // Transpose
/////////////////////////////////////////////// ///////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose // Transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid); Lattice<vobj> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = transpose(lhs._odata[ss]); auto lhs_v = lhs.View();
} accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
return ret; coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
}; });
return ret;
};
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose // Index level dependent transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{ {
Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid); Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]); auto lhs_v = lhs.View();
} accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
return ret; coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
}; });
} return ret;
};
NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -26,59 +26,55 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_UNARY_H #ifndef GRID_LATTICE_UNARY_H
#define GRID_LATTICE_UNARY_H #define GRID_LATTICE_UNARY_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=pow(rhs._odata[ss],y);
}
return ret;
}
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs,Integer y){
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=mod(rhs._odata[ss],y);
}
return ret;
}
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=div(rhs._odata[ss],y);
}
return ret;
}
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret(rhs._grid);
ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
}
return ret;
}
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y);
});
return ret_i;
} }
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y));
});
return ret_i;
}
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View();
auto rhs = rhs_i.View();
ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y));
});
return ret_i;
}
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
});
return ret_i;
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -28,27 +28,27 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/util/CompilerCompatible.h> #include <Grid/util/CompilerCompatible.h>
#include <cxxabi.h> #include <cxxabi.h>
#include <memory> #include <memory>
namespace Grid { NAMESPACE_BEGIN(Grid);
std::string demangle(const char* name) { std::string demangle(const char* name) {
int status = -4; // some arbitrary value to eliminate the compiler warning int status = -4; // some arbitrary value to eliminate the compiler warning
// enable c++11 by passing the flag -std=c++11 to g++ // enable c++11 by passing the flag -std=c++11 to g++
std::unique_ptr<char, void(*)(void*)> res { std::unique_ptr<char, void(*)(void*)> res {
abi::__cxa_demangle(name, NULL, NULL, &status), abi::__cxa_demangle(name, NULL, NULL, &status),
std::free std::free
}; };
return (status==0) ? res.get() : name ; return (status==0) ? res.get() : name ;
} }
GridStopWatch Logger::GlobalStopWatch; GridStopWatch Logger::GlobalStopWatch;
int Logger::timestamp; int Logger::timestamp;
@ -109,8 +109,9 @@ void Grid_quiesce_nodes(void) {
} }
void Grid_unquiesce_nodes(void) { void Grid_unquiesce_nodes(void) {
#ifdef GRID_COMMS_MPI #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
std::cout.clear(); std::cout.clear();
#endif #endif
} }
} NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -25,8 +25,8 @@
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <map> #include <map>
@ -37,13 +37,12 @@
#include <execinfo.h> #include <execinfo.h>
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// Dress the output; use std::chrono for time stamping via the StopWatch class // Dress the output; use std::chrono for time stamping via the StopWatch class
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
class Colours{ class Colours{
protected: protected:
bool is_active; bool is_active;
@ -57,15 +56,15 @@ public:
void Active(bool activate){ void Active(bool activate){
is_active=activate; is_active=activate;
if (is_active){ if (is_active){
colour["BLACK"] ="\033[30m"; colour["BLACK"] ="\033[30m";
colour["RED"] ="\033[31m"; colour["RED"] ="\033[31m";
colour["GREEN"] ="\033[32m"; colour["GREEN"] ="\033[32m";
colour["YELLOW"] ="\033[33m"; colour["YELLOW"] ="\033[33m";
colour["BLUE"] ="\033[34m"; colour["BLUE"] ="\033[34m";
colour["PURPLE"] ="\033[35m"; colour["PURPLE"] ="\033[35m";
colour["CYAN"] ="\033[36m"; colour["CYAN"] ="\033[36m";
colour["WHITE"] ="\033[37m"; colour["WHITE"] ="\033[37m";
colour["NORMAL"] ="\033[0;39m"; colour["NORMAL"] ="\033[0;39m";
} else { } else {
colour["BLACK"] =""; colour["BLACK"] ="";
colour["RED"] =""; colour["RED"] ="";
@ -102,14 +101,14 @@ public:
std::string colour() {return Painter.colour[COLOUR];} std::string colour() {return Painter.colour[COLOUR];}
Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on), Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on),
name(nm), name(nm),
topName(topNm), topName(topNm),
Painter(col_class), Painter(col_class),
timing_mode(0), timing_mode(0),
COLOUR(col) COLOUR(col)
{ {
StopWatch = & GlobalStopWatch; StopWatch = & GlobalStopWatch;
}; };
void Active(int on) {active = on;}; void Active(int on) {active = on;};
int isActive(void) {return active;}; int isActive(void) {return active;};
@ -164,7 +163,7 @@ public:
class GridLogger: public Logger { class GridLogger: public Logger {
public: public:
GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"): GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
Logger("Grid", on, nm, col_class, col_key){}; Logger("Grid", on, nm, col_class, col_key){};
}; };
void GridLogConfigure(std::vector<std::string> &logstreams); void GridLogConfigure(std::vector<std::string> &logstreams);
@ -181,39 +180,39 @@ extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ; extern GridLogger GridLogIntegrator ;
extern Colours GridLogColours; extern Colours GridLogColours;
std::string demangle(const char* name) ; std::string demangle(const char* name) ;
#define _NBACKTRACE (256) #define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE]; extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() {\ #define BACKTRACEFILE() { \
char string[20]; \ char string[20]; \
std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \ std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
std::FILE * fp = std::fopen(string,"w"); \ std::FILE * fp = std::fopen(string,"w"); \
BACKTRACEFP(fp)\ BACKTRACEFP(fp) \
std::fclose(fp); \ std::fclose(fp); \
} }
#ifdef HAVE_EXECINFO_H #ifdef HAVE_EXECINFO_H
#define BACKTRACEFP(fp) { \ #define BACKTRACEFP(fp) { \
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\ int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE); \
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\ char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols); \
for (int i = 0; i < symbols; i++){\ for (int i = 0; i < symbols; i++){ \
std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \ std::fprintf (fp,"BackTrace Strings: %d %s\n",i, demangle(strings[i]).c_str()); std::fflush(fp); \
}\ } \
} }
#else #else
#define BACKTRACEFP(fp) { \ #define BACKTRACEFP(fp) { \
std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \ std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \ std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \ std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \ std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
} }
#endif #endif
#define BACKTRACE() BACKTRACEFP(stdout) #define BACKTRACE() BACKTRACEFP(stdout)
NAMESPACE_END(Grid);
}
#endif #endif

View File

@ -26,8 +26,7 @@
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_BINARY_IO_H #pragma once
#define GRID_BINARY_IO_H
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO #define USE_MPI_IO
@ -42,8 +41,7 @@
#include <arpa/inet.h> #include <arpa/inet.h>
#include <algorithm> #include <algorithm>
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Byte reversal garbage // Byte reversal garbage
@ -91,7 +89,7 @@ class BinaryIO {
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid; GridBase *grid = lat.Grid();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
@ -111,21 +109,20 @@ class BinaryIO {
lsites = 1; lsites = 1;
} }
PARALLEL_REGION thread_region
{ {
uint32_t nersc_csum_thr = 0; uint32_t nersc_csum_thr = 0;
PARALLEL_FOR_LOOP_INTERN thread_for_in_region( local_site, lsites,
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{ {
uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++) for (uint64_t j = 0; j < size32; j++)
{ {
nersc_csum_thr = nersc_csum_thr + site_buf[j]; nersc_csum_thr = nersc_csum_thr + site_buf[j];
} }
} });
PARALLEL_CRITICAL thread_critical
{ {
nersc_csum += nersc_csum_thr; nersc_csum += nersc_csum_thr;
} }
@ -134,28 +131,25 @@ PARALLEL_CRITICAL
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{ {
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
int nd = grid->_ndimension; int nd = grid->_ndimension;
uint64_t lsites =grid->lSites(); uint64_t lsites =grid->lSites();
if (fbuf.size()==1) { if (fbuf.size()==1) {
lsites=1; lsites=1;
} }
std::vector<int> local_vol =grid->LocalDimensions(); Coordinate local_vol =grid->LocalDimensions();
std::vector<int> local_start =grid->LocalStarts(); Coordinate local_start =grid->LocalStarts();
std::vector<int> global_vol =grid->FullDimensions(); Coordinate global_vol =grid->FullDimensions();
PARALLEL_REGION thread_region
{ {
std::vector<int> coor(nd); Coordinate coor(nd);
uint32_t scidac_csuma_thr=0; uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0; uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0; uint32_t site_crc=0;
PARALLEL_FOR_LOOP_INTERN thread_for_in_region( local_site, lsites,
for(uint64_t local_site=0;local_site<lsites;local_site++){ {
uint32_t * site_buf = (uint32_t *)&fbuf[local_site]; uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
@ -182,9 +176,9 @@ PARALLEL_FOR_LOOP_INTERN
// std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl; // std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29); scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31); scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
} });
PARALLEL_CRITICAL thread_critical
{ {
scidac_csuma^= scidac_csuma_thr; scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr; scidac_csumb^= scidac_csumb_thr;
@ -202,9 +196,9 @@ PARALLEL_CRITICAL
{ {
uint32_t * f = (uint32_t *)file_object; uint32_t * f = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t); uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){ thread_for( i, count, {
f[i] = ntohl(f[i]); f[i] = ntohl(f[i]);
} });
} }
// LE must Swap and switch to host // LE must Swap and switch to host
static inline void le32toh_v(void *file_object,uint64_t bytes) static inline void le32toh_v(void *file_object,uint64_t bytes)
@ -212,13 +206,13 @@ PARALLEL_CRITICAL
uint32_t *fp = (uint32_t *)file_object; uint32_t *fp = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t); uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){ thread_for(i,count,{
uint32_t f; uint32_t f;
f = fp[i]; f = fp[i];
// got network order and the network to host // got network order and the network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f); fp[i] = ntohl(f);
} });
} }
// BE is same as network // BE is same as network
@ -226,9 +220,9 @@ PARALLEL_CRITICAL
{ {
uint64_t * f = (uint64_t *)file_object; uint64_t * f = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t); uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){ thread_for( i, count, {
f[i] = Grid_ntohll(f[i]); f[i] = Grid_ntohll(f[i]);
} });
} }
// LE must swap and switch; // LE must swap and switch;
@ -236,7 +230,7 @@ PARALLEL_CRITICAL
{ {
uint64_t *fp = (uint64_t *)file_object; uint64_t *fp = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t); uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){ thread_for( i, count, {
uint64_t f,g; uint64_t f,g;
f = fp[i]; f = fp[i];
// got network order and the network to host // got network order and the network to host
@ -245,7 +239,7 @@ PARALLEL_CRITICAL
f = f >> 32; f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g); fp[i] = Grid_ntohll(g);
} });
} }
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Real action: // Real action:
@ -281,13 +275,13 @@ PARALLEL_CRITICAL
int nrank = grid->ProcessorCount(); int nrank = grid->ProcessorCount();
int myrank = grid->ThisRank(); int myrank = grid->ThisRank();
std::vector<int> psizes = grid->ProcessorGrid(); Coordinate psizes = grid->ProcessorGrid();
std::vector<int> pcoor = grid->ThisProcessorCoor(); Coordinate pcoor = grid->ThisProcessorCoor();
std::vector<int> gLattice= grid->GlobalDimensions(); Coordinate gLattice= grid->GlobalDimensions();
std::vector<int> lLattice= grid->LocalDimensions(); Coordinate lLattice= grid->LocalDimensions();
std::vector<int> lStart(ndim); Coordinate lStart(ndim);
std::vector<int> gStart(ndim); Coordinate gStart(ndim);
// Flatten the file // Flatten the file
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@ -546,7 +540,7 @@ PARALLEL_CRITICAL
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
@ -558,7 +552,7 @@ PARALLEL_CRITICAL
GridStopWatch timer; GridStopWatch timer;
timer.Start(); timer.Start();
parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]); thread_for(x,lsites, { munge(iodata[x], scalardata[x]); });
vectorizeFromLexOrdArray(scalardata,Umu); vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier(); grid->Barrier();
@ -582,7 +576,7 @@ PARALLEL_CRITICAL
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
uint64_t lsites = grid->lSites(), offsetCopy = offset; uint64_t lsites = grid->lSites(), offsetCopy = offset;
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry); int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0); bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
@ -596,7 +590,7 @@ PARALLEL_CRITICAL
GridStopWatch timer; timer.Start(); GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu); unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]); thread_for(x, lsites, { munge(scalardata[x],iodata[x]); });
grid->Barrier(); grid->Barrier();
timer.Stop(); timer.Stop();
@ -619,7 +613,7 @@ PARALLEL_CRITICAL
{ {
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl; std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
offset = offsetCopy; offset = offsetCopy;
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]); thread_for(x,lsites, { munge(scalardata[x],iodata[x]); });
} }
else else
{ {
@ -637,8 +631,8 @@ PARALLEL_CRITICAL
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Read a RNG; use IOobject and lexico map to an array of state // Read a RNG; use IOobject and lexico map to an array of state
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static inline void readRNG(GridSerialRNG &serial, static inline void readRNG(GridSerialRNG &serial_rng,
GridParallelRNG &parallel, GridParallelRNG &parallel_rng,
std::string file, std::string file,
uint64_t offset, uint64_t offset,
uint32_t &nersc_csum, uint32_t &nersc_csum,
@ -652,7 +646,7 @@ PARALLEL_CRITICAL
std::string format = "IEEE32BIG"; std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid; GridBase *grid = parallel_rng.Grid();
uint64_t gsites = grid->gSites(); uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@ -669,11 +663,11 @@ PARALLEL_CRITICAL
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
timer.Start(); timer.Start();
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){ thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx); parallel_rng.SetState(tmp,lidx);
} });
timer.Stop(); timer.Stop();
iodata.resize(1); iodata.resize(1);
@ -683,7 +677,7 @@ PARALLEL_CRITICAL
{ {
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin()); std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
serial.SetState(tmp,0); serial_rng.SetState(tmp,0);
} }
nersc_csum = nersc_csum + nersc_csum_tmp; nersc_csum = nersc_csum + nersc_csum_tmp;
@ -699,8 +693,8 @@ PARALLEL_CRITICAL
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Write a RNG; lexico map to an array of state and use IOobject // Write a RNG; lexico map to an array of state and use IOobject
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static inline void writeRNG(GridSerialRNG &serial, static inline void writeRNG(GridSerialRNG &serial_rng,
GridParallelRNG &parallel, GridParallelRNG &parallel_rng,
std::string file, std::string file,
uint64_t offset, uint64_t offset,
uint32_t &nersc_csum, uint32_t &nersc_csum,
@ -712,7 +706,7 @@ PARALLEL_CRITICAL
const int RngStateCount = GridSerialRNG::RngStateCount; const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate; typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid; GridBase *grid = parallel_rng.Grid();
uint64_t gsites = grid->gSites(); uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@ -727,11 +721,11 @@ PARALLEL_CRITICAL
timer.Start(); timer.Start();
std::vector<RNGstate> iodata(lsites); std::vector<RNGstate> iodata(lsites);
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){ thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx); parallel_rng.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
} });
timer.Stop(); timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
@ -739,7 +733,7 @@ PARALLEL_CRITICAL
iodata.resize(1); iodata.resize(1);
{ {
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
serial.GetState(tmp,0); serial_rng.GetState(tmp,0);
std::copy(tmp.begin(),tmp.end(),iodata[0].begin()); std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
} }
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
@ -756,5 +750,4 @@ PARALLEL_CRITICAL
} }
}; };
} NAMESPACE_END(Grid);
#endif

View File

@ -24,8 +24,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ILDG_IO_H #pragma once
#define GRID_ILDG_IO_H
#ifdef HAVE_LIME #ifdef HAVE_LIME
#include <algorithm> #include <algorithm>
@ -43,8 +42,7 @@ extern "C" {
#include "lime.h" #include "lime.h"
} }
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
#define GRID_FIELD_NORM "FieldNormMetaData" #define GRID_FIELD_NORM "FieldNormMetaData"
#define GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) \ #define GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) \
@ -140,7 +138,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
///////////////////////////////////// /////////////////////////////////////
// Scidac Private File structure // Scidac Private File structure
///////////////////////////////////// /////////////////////////////////////
_scidacFile = scidacFile(field._grid); _scidacFile = scidacFile(field.Grid());
///////////////////////////////////// /////////////////////////////////////
// Scidac Private Record structure // Scidac Private Record structure
@ -227,10 +225,10 @@ class GridLimeReader : public BinaryIO {
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl; // std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites; uint64_t PayloadSize = sizeof(sobj) * field.Grid()->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl; // std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field._grid->_gsites<<std::endl; // std::cout << "R Gsites " <<field.Grid()->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl; // std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl; // std::cout << "R file size " <<file_bytes <<std::endl;
@ -406,7 +404,7 @@ class GridLimeWriter : public BinaryIO
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Write a generic lattice field and csum // Write a generic lattice field and csum
// This routine is Collectively called by all nodes // This routine is Collectively called by all nodes
// in communicator used by the field._grid // in communicator used by the field.Grid()
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template<class vobj> template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name) void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
@ -425,8 +423,8 @@ class GridLimeWriter : public BinaryIO
// v) Continue writing scidac record. // v) Continue writing scidac record.
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
GridBase *grid = field._grid; GridBase *grid = field.Grid();
assert(boss_node == field._grid->IsBoss() ); assert(boss_node == field.Grid()->IsBoss() );
FieldNormMetaData FNMD; FNMD.norm2 = norm2(field); FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
@ -443,7 +441,7 @@ class GridLimeWriter : public BinaryIO
} }
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl; // std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl; // std::cout << "W Gsites " <<field.Grid()->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl; // std::cout << "W Payload expected " <<PayloadSize<<std::endl;
//////////////////////////////////////////////// ////////////////////////////////////////////////
@ -515,7 +513,7 @@ class ScidacWriter : public GridLimeWriter {
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord, void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0) const unsigned int recordScientificPrec = 0)
{ {
GridBase * grid = field._grid; GridBase * grid = field.Grid();
//////////////////////////////////////// ////////////////////////////////////////
// fill the Grid header // fill the Grid header
@ -557,7 +555,7 @@ class ScidacReader : public GridLimeReader {
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase * grid = field._grid; GridBase * grid = field.Grid();
//////////////////////////////////////// ////////////////////////////////////////
// fill the Grid header // fill the Grid header
@ -624,7 +622,7 @@ class IldgWriter : public ScidacWriter {
template <class vsimd> template <class vsimd>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
{ {
GridBase * grid = Umu._grid; GridBase * grid = Umu.Grid();
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj; typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@ -717,9 +715,9 @@ class IldgReader : public GridLimeReader {
typedef LorentzColourMatrixF fobj; typedef LorentzColourMatrixF fobj;
typedef LorentzColourMatrixD dobj; typedef LorentzColourMatrixD dobj;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
std::vector<int> dims = Umu._grid->FullDimensions(); Coordinate dims = Umu.Grid()->FullDimensions();
assert(dims.size()==4); assert(dims.size()==4);
@ -853,6 +851,7 @@ class IldgReader : public GridLimeReader {
// Minimally must find binary segment and checksum // Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format // Since this is an ILDG reader require ILDG format
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
assert(found_ildgLFN);
assert(found_ildgBinary); assert(found_ildgBinary);
assert(found_ildgFormat); assert(found_ildgFormat);
assert(found_scidacChecksum); assert(found_scidacChecksum);
@ -930,9 +929,9 @@ class IldgReader : public GridLimeReader {
} }
}; };
}} NAMESPACE_END(Grid);
//HAVE_LIME //HAVE_LIME
#endif #endif
#endif

View File

@ -23,7 +23,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ILDGTYPES_IO_H #ifndef GRID_ILDGTYPES_IO_H
#define GRID_ILDGTYPES_IO_H #define GRID_ILDGTYPES_IO_H
@ -32,7 +32,7 @@ extern "C" { // for linkage
#include "lime.h" #include "lime.h"
} }
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Data representation of records that enter ILDG and SciDac formats // Data representation of records that enter ILDG and SciDac formats
@ -51,12 +51,12 @@ namespace Grid {
// Unused SCIDAC records names; could move to support this functionality // Unused SCIDAC records names; could move to support this functionality
#define SCIDAC_SITELIST "scidac-sitelist" #define SCIDAC_SITELIST "scidac-sitelist"
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
const int GRID_IO_MULTIFILE = 1; // hardcode lift from QIO compat const int GRID_IO_MULTIFILE = 1; // hardcode lift from QIO compat
const int GRID_IO_FIELD = 0; // hardcode lift from QIO compat const int GRID_IO_FIELD = 0; // hardcode lift from QIO compat
const int GRID_IO_GLOBAL = 1; // hardcode lift from QIO compat const int GRID_IO_GLOBAL = 1; // hardcode lift from QIO compat
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// QIO uses mandatory "private" records fixed format // QIO uses mandatory "private" records fixed format
@ -74,7 +74,7 @@ struct emptyUserRecord : Serializable {
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile> // <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
//////////////////////// ////////////////////////
struct scidacFile : Serializable { struct scidacFile : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile, GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
double, version, double, version,
int, spacetime, int, spacetime,
@ -91,7 +91,7 @@ struct scidacFile : Serializable {
return dimensions; return dimensions;
} }
void setDimensions(std::vector<int> dimensions) { void setDimensions(Coordinate dimensions) {
char delimiter = ' '; char delimiter = ' ';
std::stringstream stream; std::stringstream stream;
for(int i=0;i<dimensions.size();i++){ for(int i=0;i<dimensions.size();i++){
@ -124,7 +124,7 @@ struct scidacFile : Serializable {
/////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////
struct scidacRecord : Serializable { struct scidacRecord : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord, GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
double, version, double, version,
std::string, date, std::string, date,
@ -160,7 +160,7 @@ public:
// USQCD info // USQCD info
//////////////////////// ////////////////////////
struct usqcdInfo : Serializable { struct usqcdInfo : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo, GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
double, version, double, version,
double, plaq, double, plaq,
@ -174,7 +174,7 @@ struct usqcdInfo : Serializable {
// Scidac Checksum // Scidac Checksum
//////////////////////// ////////////////////////
struct scidacChecksum : Serializable { struct scidacChecksum : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum, GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
double, version, double, version,
std::string, suma, std::string, suma,
@ -201,7 +201,7 @@ struct scidacChecksum : Serializable {
// From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf // From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
struct usqcdPropFile : Serializable { struct usqcdPropFile : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile, GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
double, version, double, version,
std::string, type, std::string, type,
@ -211,7 +211,7 @@ struct usqcdPropFile : Serializable {
}; };
}; };
struct usqcdSourceInfo : Serializable { struct usqcdSourceInfo : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo, GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
double, version, double, version,
std::string, info); std::string, info);
@ -220,7 +220,7 @@ struct usqcdSourceInfo : Serializable {
}; };
}; };
struct usqcdPropInfo : Serializable { struct usqcdPropInfo : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo, GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
double, version, double, version,
int, spin, int, spin,
@ -232,6 +232,6 @@ struct usqcdPropInfo : Serializable {
}; };
#endif #endif
} NAMESPACE_END(Grid);
#endif #endif
#endif #endif

View File

@ -36,23 +36,24 @@
#include <sys/utsname.h> #include <sys/utsname.h>
#include <pwd.h> #include <pwd.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// Precision mapping // Precision mapping
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
template<class vobj> static std::string getFormatString (void) template<class vobj> static std::string getFormatString (void)
{ {
std::string format; std::string format;
typedef typename getPrecision<vobj>::real_scalar_type stype; typedef typename getPrecision<vobj>::real_scalar_type stype;
if ( sizeof(stype) == sizeof(float) ) { if ( sizeof(stype) == sizeof(float) ) {
format = std::string("IEEE32BIG"); format = std::string("IEEE32BIG");
}
if ( sizeof(stype) == sizeof(double) ) {
format = std::string("IEEE64BIG");
}
return format;
} }
if ( sizeof(stype) == sizeof(double) ) {
format = std::string("IEEE64BIG");
}
return format;
};
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// header specification/interpretation // header specification/interpretation
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -93,146 +94,145 @@ namespace Grid {
link_trace(0.), plaquette(0.), checksum(0), link_trace(0.), plaquette(0.), checksum(0),
scidac_checksuma(0), scidac_checksumb(0), sequence_number(0) scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
{} {}
}; };
namespace QCD { // PB disable using namespace - this is a header and forces namesapce visibility for all
// including files
//using namespace Grid;
using namespace Grid; //////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data
//////////////////////////////////////////////////////////////////////
inline void GridMetaData(GridBase *grid,FieldMetaData &header)
{
int nd = grid->_ndimension;
header.nd = nd;
header.dimension.resize(nd);
header.boundary.resize(nd);
header.data_start = 0;
for(int d=0;d<nd;d++) {
header.dimension[d] = grid->_fdimensions[d];
}
for(int d=0;d<nd;d++) {
header.boundary[d] = std::string("PERIODIC");
}
}
inline void MachineCharacteristics(FieldMetaData &header)
{
// Who
struct passwd *pw = getpwuid (getuid());
if (pw) header.creator = std::string(pw->pw_name);
////////////////////////////////////////////////////////////////////// // When
// Bit and Physical Checksumming and QA of data std::time_t t = std::time(nullptr);
////////////////////////////////////////////////////////////////////// std::tm tm_ = *std::localtime(&t);
inline void GridMetaData(GridBase *grid,FieldMetaData &header) std::ostringstream oss;
{ // oss << std::put_time(&tm_, "%c %Z");
int nd = grid->_ndimension; header.creation_date = oss.str();
header.nd = nd; header.archive_date = header.creation_date;
header.dimension.resize(nd);
header.boundary.resize(nd);
header.data_start = 0;
for(int d=0;d<nd;d++) {
header.dimension[d] = grid->_fdimensions[d];
}
for(int d=0;d<nd;d++) {
header.boundary[d] = std::string("PERIODIC");
}
}
inline void MachineCharacteristics(FieldMetaData &header) // What
{ struct utsname name; uname(&name);
// Who header.creator_hardware = std::string(name.nodename)+"-";
struct passwd *pw = getpwuid (getuid()); header.creator_hardware+= std::string(name.machine)+"-";
if (pw) header.creator = std::string(pw->pw_name); header.creator_hardware+= std::string(name.sysname)+"-";
header.creator_hardware+= std::string(name.release);
// When }
std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t);
std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str();
header.archive_date = header.creation_date;
// What
struct utsname name; uname(&name);
header.creator_hardware = std::string(name.nodename)+"-";
header.creator_hardware+= std::string(name.machine)+"-";
header.creator_hardware+= std::string(name.sysname)+"-";
header.creator_hardware+= std::string(name.release);
}
#define dump_meta_data(field, s) \ #define dump_meta_data(field, s) \
s << "BEGIN_HEADER" << std::endl; \ s << "BEGIN_HEADER" << std::endl; \
s << "HDR_VERSION = " << field.hdr_version << std::endl; \ s << "HDR_VERSION = " << field.hdr_version << std::endl; \
s << "DATATYPE = " << field.data_type << std::endl; \ s << "DATATYPE = " << field.data_type << std::endl; \
s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \ s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \
for(int i=0;i<4;i++){ \ for(int i=0;i<4;i++){ \
s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \ s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
} \ } \
s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \ s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \ s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \
for(int i=0;i<4;i++){ \ for(int i=0;i<4;i++){ \
s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \ s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \
} \ } \
\ \
s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \ s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \ s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \ s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \ s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \
s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \ s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \
s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \ s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \
s << "CREATOR = " << field.creator << std::endl; \ s << "CREATOR = " << field.creator << std::endl; \
s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \ s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \
s << "CREATION_DATE = " << field.creation_date << std::endl; \ s << "CREATION_DATE = " << field.creation_date << std::endl; \
s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \ s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \
s << "FLOATING_POINT = " << field.floating_point << std::endl; \ s << "FLOATING_POINT = " << field.floating_point << std::endl; \
s << "END_HEADER" << std::endl; s << "END_HEADER" << std::endl;
template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header) template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
{ {
GridBase *grid = field._grid; GridBase *grid = field.Grid();
std::string format = getFormatString<vobj>(); std::string format = getFormatString<vobj>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header); GridMetaData(grid,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header) inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{ {
// How to convert data precision etc... // How to convert data precision etc...
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplF>::linkTrace(data); header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplF>::avgPlaquette(data); header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
} }
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header) inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{ {
// How to convert data precision etc... // How to convert data precision etc...
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplD>::linkTrace(data); header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplD>::avgPlaquette(data); header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
} }
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{ {
GridBase *grid = field._grid; GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixF>(); std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header); GridMetaData(grid,header);
GaugeStatistics(field,header); GaugeStatistics(field,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{ {
GridBase *grid = field._grid; GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixD>(); std::string format = getFormatString<vLorentzColourMatrixD>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header); GridMetaData(grid,header);
GaugeStatistics(field,header); GaugeStatistics(field,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Utilities ; these are QCD aware // Utilities ; these are QCD aware
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm) inline void reconstruct3(LorentzColourMatrix & cm)
{ {
const int x=0; const int x=0;
const int y=1; const int y=1;
const int z=2; const int z=2;
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
} }
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage // Some data types for intermediate storage
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >; template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3; typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F; typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D; typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Simple classes for precision conversion // Simple classes for precision conversion
@ -276,56 +276,55 @@ struct BinarySimpleMunger {
}; };
template<class fobj,class sobj> template<class fobj,class sobj>
struct GaugeSimpleMunger{ struct GaugeSimpleMunger{
void operator()(fobj &in, sobj &out) { void operator()(fobj &in, sobj &out) {
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) { for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) { for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j); out(mu)()(i, j) = in(mu)()(i, j);
}} }}
} }
}; };
}; };
template <class fobj, class sobj> template <class fobj, class sobj>
struct GaugeSimpleUnmunger { struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) { void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) { for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) { for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j); out(mu)()(i, j) = in(mu)()(i, j);
}} }}
} }
}; };
}; };
template<class fobj,class sobj> template<class fobj,class sobj>
struct Gauge3x2munger{ struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){ void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){ for(int i=0;i<2;i++){
for(int j=0;j<3;j++){ for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)(i)(j); out(mu)()(i,j) = in(mu)(i)(j);
}} }}
} }
reconstruct3(out); reconstruct3(out);
}
};
template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}
}
};
} }
};
template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}
}
};
NAMESPACE_END(Grid);
}

View File

@ -30,334 +30,330 @@
#ifndef GRID_NERSC_IO_H #ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H #define GRID_NERSC_IO_H
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
using namespace Grid; using namespace Grid;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Write and read from fstream; comput header offset for payload // Write and read from fstream; comput header offset for payload
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO { class NerscIO : public BinaryIO {
public: public:
static inline void truncate(std::string file){ static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out); std::ofstream fout(file,std::ios::out);
} }
static inline unsigned int writeHeader(FieldMetaData &field,std::string file) static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
{ {
std::ofstream fout(file,std::ios::out|std::ios::in); std::ofstream fout(file,std::ios::out|std::ios::in);
fout.seekp(0,std::ios::beg); fout.seekp(0,std::ios::beg);
dump_meta_data(field, fout); dump_meta_data(field, fout);
field.data_start = fout.tellp(); field.data_start = fout.tellp();
return field.data_start; return field.data_start;
} }
// for the header-reader // for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field) static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{ {
uint64_t offset=0; std::map<std::string,std::string> header;
std::map<std::string,std::string> header; std::string line;
std::string line;
////////////////////////////////////////////////// //////////////////////////////////////////////////
// read the header // read the header
////////////////////////////////////////////////// //////////////////////////////////////////////////
std::ifstream fin(file); std::ifstream fin(file);
getline(fin,line); // read one line and insist is getline(fin,line); // read one line and insist is
removeWhitespace(line); removeWhitespace(line);
std::cout << GridLogMessage << "* " << line << std::endl; std::cout << GridLogMessage << "* " << line << std::endl;
assert(line==std::string("BEGIN_HEADER")); assert(line==std::string("BEGIN_HEADER"));
do { do {
getline(fin,line); // read one line getline(fin,line); // read one line
std::cout << GridLogMessage << "* "<<line<< std::endl; std::cout << GridLogMessage << "* "<<line<< std::endl;
int eq = line.find("="); int eq = line.find("=");
if(eq >0) { if(eq >0) {
std::string key=line.substr(0,eq); std::string key=line.substr(0,eq);
std::string val=line.substr(eq+1); std::string val=line.substr(eq+1);
removeWhitespace(key); removeWhitespace(key);
removeWhitespace(val); removeWhitespace(val);
header[key] = val; header[key] = val;
} }
} while( line.find("END_HEADER") == std::string::npos ); } while( line.find("END_HEADER") == std::string::npos );
field.data_start = fin.tellg(); field.data_start = fin.tellg();
////////////////////////////////////////////////// //////////////////////////////////////////////////
// chomp the values // chomp the values
////////////////////////////////////////////////// //////////////////////////////////////////////////
field.hdr_version = header["HDR_VERSION"]; field.hdr_version = header["HDR_VERSION"];
field.data_type = header["DATATYPE"]; field.data_type = header["DATATYPE"];
field.storage_format = header["STORAGE_FORMAT"]; field.storage_format = header["STORAGE_FORMAT"];
field.dimension[0] = std::stol(header["DIMENSION_1"]); field.dimension[0] = std::stol(header["DIMENSION_1"]);
field.dimension[1] = std::stol(header["DIMENSION_2"]); field.dimension[1] = std::stol(header["DIMENSION_2"]);
field.dimension[2] = std::stol(header["DIMENSION_3"]); field.dimension[2] = std::stol(header["DIMENSION_3"]);
field.dimension[3] = std::stol(header["DIMENSION_4"]); field.dimension[3] = std::stol(header["DIMENSION_4"]);
assert(grid->_ndimension == 4); assert(grid->_ndimension == 4);
for(int d=0;d<4;d++){ for(int d=0;d<4;d++){
assert(grid->_fdimensions[d]==field.dimension[d]); assert(grid->_fdimensions[d]==field.dimension[d]);
} }
field.link_trace = std::stod(header["LINK_TRACE"]); field.link_trace = std::stod(header["LINK_TRACE"]);
field.plaquette = std::stod(header["PLAQUETTE"]); field.plaquette = std::stod(header["PLAQUETTE"]);
field.boundary[0] = header["BOUNDARY_1"]; field.boundary[0] = header["BOUNDARY_1"];
field.boundary[1] = header["BOUNDARY_2"]; field.boundary[1] = header["BOUNDARY_2"];
field.boundary[2] = header["BOUNDARY_3"]; field.boundary[2] = header["BOUNDARY_3"];
field.boundary[3] = header["BOUNDARY_4"]; field.boundary[3] = header["BOUNDARY_4"];
field.checksum = std::stoul(header["CHECKSUM"],0,16); field.checksum = std::stoul(header["CHECKSUM"],0,16);
field.ensemble_id = header["ENSEMBLE_ID"]; field.ensemble_id = header["ENSEMBLE_ID"];
field.ensemble_label = header["ENSEMBLE_LABEL"]; field.ensemble_label = header["ENSEMBLE_LABEL"];
field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]); field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]);
field.creator = header["CREATOR"]; field.creator = header["CREATOR"];
field.creator_hardware = header["CREATOR_HARDWARE"]; field.creator_hardware = header["CREATOR_HARDWARE"];
field.creation_date = header["CREATION_DATE"]; field.creation_date = header["CREATION_DATE"];
field.archive_date = header["ARCHIVE_DATE"]; field.archive_date = header["ARCHIVE_DATE"];
field.floating_point = header["FLOATING_POINT"]; field.floating_point = header["FLOATING_POINT"];
return field.data_start; return field.data_start;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
FieldMetaData& header,
std::string file)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu.Grid();
uint64_t offset = readHeader(file,Umu.Grid(),header);
FieldMetaData clone(header);
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
assert(0);
} }
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// GaugeStatistics(Umu,clone);
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd> std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, <<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
FieldMetaData& header, std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
std::string file) <<" header "<<header.plaquette<<std::endl;
{ std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; <<" header "<<header.link_trace<<std::endl;
GridBase *grid = Umu._grid; if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
uint64_t offset = readHeader(file,Umu._grid,header); std::cout << " Plaquette mismatch "<<std::endl;
}
FieldMetaData clone(header); if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl;
std::string format(header.floating_point); std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
int ieee32big = (format == std::string("IEEE32BIG")); std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
int ieee32 = (format == std::string("IEEE32")); exit(0);
int ieee64big = (format == std::string("IEEE64BIG")); }
int ieee64 = (format == std::string("IEEE64")); assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
uint32_t nersc_csum,scidac_csuma,scidac_csumb; assert(nersc_csum == header.checksum );
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
assert(0);
}
GaugeStatistics(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
<<" header "<<header.plaquette<<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
<<" header "<<header.link_trace<<std::endl;
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
std::cout << " Plaquette mismatch "<<std::endl;
std::cout << Umu[0]<<std::endl;
std::cout << Umu[1]<<std::endl;
}
if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl;
std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl; std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
} }
template<class vsimd> template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
std::string file, std::string file,
int two_row, int two_row,
int bits32) int bits32)
{ {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj; typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
FieldMetaData header; FieldMetaData header;
/////////////////////////////////////////// ///////////////////////////////////////////
// Following should become arguments // Following should become arguments
/////////////////////////////////////////// ///////////////////////////////////////////
header.sequence_number = 1; header.sequence_number = 1;
header.ensemble_id = "UKQCD"; header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF"; header.ensemble_label = "DWF";
typedef LorentzColourMatrixD fobj3D; typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D; typedef LorentzColour2x3D fobj2D;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
GaugeStatistics(Umu,header); GaugeStatistics(Umu,header);
MachineCharacteristics(header); MachineCharacteristics(header);
uint64_t offset; uint64_t offset;
// Sod it -- always write 3x3 double // Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG"); header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3"); header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge; GaugeSimpleUnmunger<fobj3D,sobj> munge;
if ( grid->IsBoss() ) { if ( grid->IsBoss() ) {
truncate(file); truncate(file);
offset = writeHeader(header,file); offset = writeHeader(header,file);
} }
grid->Broadcast(0,(void *)&offset,sizeof(offset)); grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point, BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum; header.checksum = nersc_csum;
if ( grid->IsBoss() ) { if ( grid->IsBoss() ) {
writeHeader(header,file); writeHeader(header,file);
} }
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum " std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum <<std::hex<<header.checksum
<<std::dec<<" plaq "<< header.plaquette <<std::endl; <<std::dec<<" plaq "<< header.plaquette <<std::endl;
} }
/////////////////////////////// ///////////////////////////////
// RNG state // RNG state
/////////////////////////////// ///////////////////////////////
static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file) static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
{ {
typedef typename GridParallelRNG::RngStateType RngStateType; typedef typename GridParallelRNG::RngStateType RngStateType;
// Following should become arguments // Following should become arguments
FieldMetaData header; FieldMetaData header;
header.sequence_number = 1; header.sequence_number = 1;
header.ensemble_id = "UKQCD"; header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF"; header.ensemble_label = "DWF";
GridBase *grid = parallel._grid; GridBase *grid = parallel.Grid();
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
header.link_trace=0.0; header.link_trace=0.0;
header.plaquette=0.0; header.plaquette=0.0;
MachineCharacteristics(header); MachineCharacteristics(header);
uint64_t offset; uint64_t offset;
#ifdef RNG_RANLUX #ifdef RNG_RANLUX
header.floating_point = std::string("UINT64"); header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48"); header.data_type = std::string("RANLUX48");
#endif #endif
#ifdef RNG_MT19937 #ifdef RNG_MT19937
header.floating_point = std::string("UINT32"); header.floating_point = std::string("UINT32");
header.data_type = std::string("MT19937"); header.data_type = std::string("MT19937");
#endif #endif
#ifdef RNG_SITMO #ifdef RNG_SITMO
header.floating_point = std::string("UINT64"); header.floating_point = std::string("UINT64");
header.data_type = std::string("SITMO"); header.data_type = std::string("SITMO");
#endif #endif
if ( grid->IsBoss() ) { if ( grid->IsBoss() ) {
truncate(file); truncate(file);
offset = writeHeader(header,file); offset = writeHeader(header,file);
} }
grid->Broadcast(0,(void *)&offset,sizeof(offset)); grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb); BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum; header.checksum = nersc_csum;
if ( grid->IsBoss() ) { if ( grid->IsBoss() ) {
offset = writeHeader(header,file); offset = writeHeader(header,file);
} }
std::cout<<GridLogMessage std::cout<<GridLogMessage
<<"Written NERSC RNG STATE "<<file<< " checksum " <<"Written NERSC RNG STATE "<<file<< " checksum "
<<std::hex<<header.checksum <<std::hex<<header.checksum
<<std::dec<<std::endl; <<std::dec<<std::endl;
} }
static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file) static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
{ {
typedef typename GridParallelRNG::RngStateType RngStateType; typedef typename GridParallelRNG::RngStateType RngStateType;
GridBase *grid = parallel._grid; GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header); uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header); FieldMetaData clone(header);
std::string format(header.floating_point); std::string format(header.floating_point);
std::string data_type(header.data_type); std::string data_type(header.data_type);
#ifdef RNG_RANLUX #ifdef RNG_RANLUX
assert(format == std::string("UINT64")); assert(format == std::string("UINT64"));
assert(data_type == std::string("RANLUX48")); assert(data_type == std::string("RANLUX48"));
#endif #endif
#ifdef RNG_MT19937 #ifdef RNG_MT19937
assert(format == std::string("UINT32")); assert(format == std::string("UINT32"));
assert(data_type == std::string("MT19937")); assert(data_type == std::string("MT19937"));
#endif #endif
#ifdef RNG_SITMO #ifdef RNG_SITMO
assert(format == std::string("UINT64")); assert(format == std::string("UINT64"));
assert(data_type == std::string("SITMO")); assert(data_type == std::string("SITMO"));
#endif #endif
// depending on datatype, set up munger; // depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type> // munger is a function of <floating point, Real, data_type>
uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb); BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
if ( nersc_csum != header.checksum ) { if ( nersc_csum != header.checksum ) {
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl; std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
exit(0); exit(0);
} }
assert(nersc_csum == header.checksum ); assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl; std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
} }
};
}; NAMESPACE_END(QCD);
}}
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -23,13 +23,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B) #define RawConfig(A,B) (A<<8|B)
@ -39,16 +39,16 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES}, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS}, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES }, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4 // 4
#ifdef KNL #ifdef KNL
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES }, { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS }, { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS }, { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS }, { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS }, { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS }, { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS }, { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
// 11 // 11
#else #else
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
@ -57,19 +57,20 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
// 11 // 11
#endif #endif
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
//15 //15
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS}, { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS}, { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS} { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
//19 //19
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" }, // { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif #endif
}; };
} NAMESPACE_END(Grid);

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -25,8 +25,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_PERFCOUNT_H #ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H #define GRID_PERFCOUNT_H
@ -47,7 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <x86intrin.h> #include <x86intrin.h>
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
#ifdef __linux__ #ifdef __linux__
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
@ -84,11 +84,14 @@ inline uint64_t cyclecount(void){
#ifdef __bgq__ #ifdef __bgq__
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
uint64_t tmp; uint64_t tmp;
asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) ); asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) );
return tmp; return tmp;
} }
#elif defined __x86_64__ #elif defined __x86_64__
#ifdef GRID_NVCC
accelerator_inline uint64_t __rdtsc(void) { return 0; }
#endif
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return __rdtsc(); return __rdtsc();
// unsigned int dummy; // unsigned int dummy;
@ -97,7 +100,7 @@ inline uint64_t cyclecount(void){
#else #else
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return 0; return 0;
} }
#endif #endif
@ -212,7 +215,7 @@ public:
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
ign=::read(fd, &count, sizeof(long long)); ign=::read(fd, &count, sizeof(long long));
ign+=::read(cyclefd, &cycles, sizeof(long long)); ign+=::read(cyclefd, &cycles, sizeof(long long));
assert(ign=2*sizeof(long long)); assert(ign==2*sizeof(long long));
} }
elapsed = cyclecount() - begin; elapsed = cyclecount() - begin;
#else #else
@ -225,8 +228,8 @@ public:
int N = PerformanceCounterConfigs[PCT].normalisation; int N = PerformanceCounterConfigs[PCT].normalisation;
const char * sn = PerformanceCounterConfigs[N].name ; const char * sn = PerformanceCounterConfigs[N].name ;
const char * sc = PerformanceCounterConfigs[PCT].name; const char * sc = PerformanceCounterConfigs[PCT].name;
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles, std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
sc, count, sc,sn, (double)count/(double)cycles); sc, count, sc,sn, (double)count/(double)cycles);
#else #else
std::printf("%llu cycles \n", elapsed ); std::printf("%llu cycles \n", elapsed );
#endif #endif
@ -241,5 +244,6 @@ public:
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -2,7 +2,7 @@
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Stat.h> #include <Grid/perfmon/Stat.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
bool PmuStat::pmu_initialized=false; bool PmuStat::pmu_initialized=false;
@ -175,39 +175,39 @@ void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
} }
void PmuStat::KNLsetup(void){ void PmuStat::KNLsetup(void){
int ret; int ret;
char fname[1024]; char fname[1024];
// MC RPQ inserts and WPQ inserts (reads & writes) // MC RPQ inserts and WPQ inserts (reads & writes)
for (int mc = 0; mc < NMC; ++mc) for (int mc = 0; mc < NMC; ++mc)
{ {
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc); ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_imc_%d",mc);
// RPQ Inserts // RPQ Inserts
KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1); KNLevsetup(fname, gbl.mc_rd[mc], 0x1, 0x1);
// WPQ Inserts // WPQ Inserts
KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1); KNLevsetup(fname, gbl.mc_wr[mc], 0x2, 0x1);
} }
// EDC RPQ inserts and WPQ inserts // EDC RPQ inserts and WPQ inserts
for (int edc=0; edc < NEDC; ++edc) for (int edc=0; edc < NEDC; ++edc)
{ {
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc); ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_eclk_%d",edc);
// RPQ inserts // RPQ inserts
KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1); KNLevsetup(fname, gbl.edc_rd[edc], 0x1, 0x1);
// WPQ inserts // WPQ inserts
KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1); KNLevsetup(fname, gbl.edc_wr[edc], 0x2, 0x1);
} }
// EDC HitE, HitM, MissE, MissM // EDC HitE, HitM, MissE, MissM
for (int edc=0; edc < NEDC; ++edc) for (int edc=0; edc < NEDC; ++edc)
{ {
::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc); ::snprintf(fname, sizeof(fname), "/sys/devices/uncore_edc_uclk_%d", edc);
KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1); KNLevsetup(fname, gbl.edc_hite[edc], 0x2, 0x1);
KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2); KNLevsetup(fname, gbl.edc_hitm[edc], 0x2, 0x2);
KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4); KNLevsetup(fname, gbl.edc_misse[edc], 0x2, 0x4);
KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8); KNLevsetup(fname, gbl.edc_missm[edc], 0x2, 0x8);
} }
} }
uint64_t PmuStat::KNLreadctr(int fd) uint64_t PmuStat::KNLreadctr(int fd)
{ {
@ -242,4 +242,5 @@ void PmuStat::KNLreadctrs(ctrs &c)
} }
#endif #endif
} NAMESPACE_END(Grid);

View File

@ -5,7 +5,7 @@
#define _KNIGHTS_LANDING_ROOTONLY #define _KNIGHTS_LANDING_ROOTONLY
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Extra KNL counters from MCDRAM // Extra KNL counters from MCDRAM
@ -15,14 +15,14 @@ namespace Grid {
#define NEDC 8 #define NEDC 8
struct ctrs struct ctrs
{ {
uint64_t mcrd[NMC]; uint64_t mcrd[NMC];
uint64_t mcwr[NMC]; uint64_t mcwr[NMC];
uint64_t edcrd[NEDC]; uint64_t edcrd[NEDC];
uint64_t edcwr[NEDC]; uint64_t edcwr[NEDC];
uint64_t edchite[NEDC]; uint64_t edchite[NEDC];
uint64_t edchitm[NEDC]; uint64_t edchitm[NEDC];
uint64_t edcmisse[NEDC]; uint64_t edcmisse[NEDC];
uint64_t edcmissm[NEDC]; uint64_t edcmissm[NEDC];
}; };
// Peter/Azusa: // Peter/Azusa:
// Our modification of a code provided by Larry Meadows from Intel // Our modification of a code provided by Larry Meadows from Intel
@ -44,61 +44,62 @@ struct knl_gbl_
class PmuStat class PmuStat
{ {
uint64_t counters[8][256]; uint64_t counters[8][256];
#ifdef _KNIGHTS_LANDING_ #ifdef _KNIGHTS_LANDING_
static struct knl_gbl_ gbl; static struct knl_gbl_ gbl;
#endif #endif
const char *name; const char *name;
uint64_t reads; // memory reads uint64_t reads; // memory reads
uint64_t writes; // memory writes uint64_t writes; // memory writes
uint64_t mrstart; // memory read counter at start of parallel region uint64_t mrstart; // memory read counter at start of parallel region
uint64_t mrend; // memory read counter at end of parallel region uint64_t mrend; // memory read counter at end of parallel region
uint64_t mwstart; // memory write counter at start of parallel region uint64_t mwstart; // memory write counter at start of parallel region
uint64_t mwend; // memory write counter at end of parallel region uint64_t mwend; // memory write counter at end of parallel region
// cumulative counters // cumulative counters
uint64_t count; // number of invocations uint64_t count; // number of invocations
uint64_t tregion; // total time in parallel region (from thread 0) uint64_t tregion; // total time in parallel region (from thread 0)
uint64_t tcycles; // total cycles inside parallel region uint64_t tcycles; // total cycles inside parallel region
uint64_t inst, ref, cyc; // fixed counters uint64_t inst, ref, cyc; // fixed counters
uint64_t pmc0, pmc1;// pmu uint64_t pmc0, pmc1;// pmu
// add memory counters here // add memory counters here
// temp variables // temp variables
uint64_t tstart; // tsc at start of parallel region uint64_t tstart; // tsc at start of parallel region
uint64_t tend; // tsc at end of parallel region uint64_t tend; // tsc at end of parallel region
// map for ctrs values // map for ctrs values
// 0 pmc0 start // 0 pmc0 start
// 1 pmc0 end // 1 pmc0 end
// 2 pmc1 start // 2 pmc1 start
// 3 pmc1 end // 3 pmc1 end
// 4 tsc start // 4 tsc start
// 5 tsc end // 5 tsc end
static bool pmu_initialized; static bool pmu_initialized;
public: public:
static bool is_init(void){ return pmu_initialized;} static bool is_init(void){ return pmu_initialized;}
static void pmu_init(void); static void pmu_init(void);
static void pmu_fini(void); static void pmu_fini(void);
static void pmu_start(void); static void pmu_start(void);
static void pmu_stop(void); static void pmu_stop(void);
void accum(int nthreads); void accum(int nthreads);
static void xmemctrs(uint64_t *mr, uint64_t *mw); static void xmemctrs(uint64_t *mr, uint64_t *mw);
void start(void); void start(void);
void enter(int t); void enter(int t);
void exit(int t); void exit(int t);
void print(void); void print(void);
void init(const char *regname); void init(const char *regname);
void clear(void); void clear(void);
#ifdef _KNIGHTS_LANDING_ #ifdef _KNIGHTS_LANDING_
static void KNLsetup(void); static void KNLsetup(void);
static uint64_t KNLreadctr(int fd); static uint64_t KNLreadctr(int fd);
static void KNLreadctrs(ctrs &c); static void KNLreadctrs(ctrs &c);
static void KNLevsetup(const char *ename, int &fd, int event, int umask); static void KNLevsetup(const char *ename, int &fd, int event, int umask);
#endif #endif
}; };
NAMESPACE_END(Grid);
}
#endif #endif

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -24,8 +24,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_TIME_H #ifndef GRID_TIME_H
#define GRID_TIME_H #define GRID_TIME_H
@ -33,11 +33,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <ctime> #include <ctime>
#include <chrono> #include <chrono>
namespace Grid { NAMESPACE_BEGIN(Grid)
// Dress the output; use std::chrono
// Dress the output; use std::chrono
// C++11 time facilities better? // C++11 time facilities better?
inline double usecond(void) { inline double usecond(void) {
struct timeval tv; struct timeval tv;
@ -80,7 +78,7 @@ inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
stream.fill(fill); stream.fill(fill);
return stream; return stream;
} }
class GridStopWatch { class GridStopWatch {
private: private:
@ -125,5 +123,6 @@ public:
} }
}; };
} NAMESPACE_END(Grid)
#endif #endif

View File

@ -14,7 +14,12 @@
#ifndef SOURCE_PUGIXML_CPP #ifndef SOURCE_PUGIXML_CPP
#define SOURCE_PUGIXML_CPP #define SOURCE_PUGIXML_CPP
#include <Grid/pugixml/pugixml.h> #ifdef __NVCC__
#pragma push
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#endif
#include "pugixml.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
@ -202,7 +207,7 @@ PUGI__NS_BEGIN
// Without a template<> we'll get multiple definitions of the same static // Without a template<> we'll get multiple definitions of the same static
template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate; template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate; template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
template struct xml_memory_management_function_storage<int>;
typedef xml_memory_management_function_storage<int> xml_memory; typedef xml_memory_management_function_storage<int> xml_memory;
PUGI__NS_END PUGI__NS_END
@ -12768,6 +12773,10 @@ namespace pugi
#undef PUGI__THROW_ERROR #undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR #undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC
#pragma pop
#endif
#endif #endif
/** /**

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -27,114 +27,112 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_QCD_BASE_H #pragma once
#define GRID_QCD_BASE_H
namespace Grid{
namespace QCD {
static const int Xdir = 0; NAMESPACE_BEGIN(Grid);
static const int Ydir = 1;
static const int Zdir = 2;
static const int Tdir = 3;
static constexpr int Xdir = 0;
static const int Xp = 0; static constexpr int Ydir = 1;
static const int Yp = 1; static constexpr int Zdir = 2;
static const int Zp = 2; static constexpr int Tdir = 3;
static const int Tp = 3;
static const int Xm = 4;
static const int Ym = 5;
static const int Zm = 6;
static const int Tm = 7;
static const int Nc=3; static constexpr int Xp = 0;
static const int Ns=4; static constexpr int Yp = 1;
static const int Nd=4; static constexpr int Zp = 2;
static const int Nhs=2; // half spinor static constexpr int Tp = 3;
static const int Nds=8; // double stored gauge field static constexpr int Xm = 4;
static const int Ngp=2; // gparity index range static constexpr int Ym = 5;
static constexpr int Zm = 6;
static constexpr int Tm = 7;
////////////////////////////////////////////////////////////////////////////// static constexpr int Nc=3;
// QCD iMatrix types static constexpr int Ns=4;
// Index conventions: Lorentz x Spin x Colour static constexpr int Nd=4;
// note: static const int or constexpr will work for type deductions static constexpr int Nhs=2; // half spinor
// with the intel compiler (up to version 17) static constexpr int Nds=8; // double stored gauge field
////////////////////////////////////////////////////////////////////////////// static constexpr int Ngp=2; // gparity index range
#define ColourIndex 2
#define SpinIndex 1
#define LorentzIndex 0
// Also should make these a named enum type //////////////////////////////////////////////////////////////////////////////
static const int DaggerNo=0; // QCD iMatrix types
static const int DaggerYes=1; // Index conventions: Lorentz x Spin x Colour
static const int InverseNo=0; // note: static constexpr int or constexpr will work for type deductions
static const int InverseYes=1; // with the intel compiler (up to version 17)
//////////////////////////////////////////////////////////////////////////////
#define ColourIndex (2)
#define SpinIndex (1)
#define LorentzIndex (0)
// Useful traits is this a spin index // Also should make these a named enum type
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; static constexpr int DaggerNo=0;
static constexpr int DaggerYes=1;
static constexpr int InverseNo=0;
static constexpr int InverseYes=1;
const int SpinorIndex = 2; // Useful traits is this a spin index
template<typename T> struct isSpinor { //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
static const bool value = (SpinorIndex==T::TensorLevel);
};
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
// ChrisK very keen to add extra space for Gparity doubling. const int SpinorIndex = 2;
// template<typename T> struct isSpinor {
// Also add domain wall index, in a way where Wilson operator static constexpr bool value = (SpinorIndex==T::TensorLevel);
// naturally distributes across the 5th dimensions. };
// template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
// That probably makes for GridRedBlack4dCartesian grid. template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
// s,sp,c,spc,lc // ChrisK very keen to add extra space for Gparity doubling.
//
// Also add domain wall index, in a way where Wilson operator
// naturally distributes across the 5th dimensions.
//
// That probably makes for GridRedBlack4dCartesian grid.
template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >; // s,sp,c,spc,lc
template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ; template<typename vtype> using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >; template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ; template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ; template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >; template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >; template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >; template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >; template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >; template<typename vtype> using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >; template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >; template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >; template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
// Spin matrix // Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix; typedef iSpinMatrix<Complex > SpinMatrix;
typedef iSpinMatrix<ComplexF > SpinMatrixF; typedef iSpinMatrix<ComplexF > SpinMatrixF;
typedef iSpinMatrix<ComplexD > SpinMatrixD; typedef iSpinMatrix<ComplexD > SpinMatrixD;
typedef iSpinMatrix<vComplex > vSpinMatrix; typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iSpinMatrix<vComplexF> vSpinMatrixF; typedef iSpinMatrix<vComplexF> vSpinMatrixF;
typedef iSpinMatrix<vComplexD> vSpinMatrixD; typedef iSpinMatrix<vComplexD> vSpinMatrixD;
// Colour Matrix // Colour Matrix
typedef iColourMatrix<Complex > ColourMatrix; typedef iColourMatrix<Complex > ColourMatrix;
typedef iColourMatrix<ComplexF > ColourMatrixF; typedef iColourMatrix<ComplexF > ColourMatrixF;
typedef iColourMatrix<ComplexD > ColourMatrixD; typedef iColourMatrix<ComplexD > ColourMatrixD;
typedef iColourMatrix<vComplex > vColourMatrix; typedef iColourMatrix<vComplex > vColourMatrix;
typedef iColourMatrix<vComplexF> vColourMatrixF; typedef iColourMatrix<vComplexF> vColourMatrixF;
typedef iColourMatrix<vComplexD> vColourMatrixD; typedef iColourMatrix<vComplexD> vColourMatrixD;
// SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iSpinColourMatrix<ComplexF > SpinColourMatrixF;
typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
// SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix;
typedef iSpinColourMatrix<ComplexF > SpinColourMatrixF;
typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF;
@ -153,383 +151,379 @@ namespace QCD {
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
// LorentzColour // LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix; typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF; typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD; typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix; typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF; typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD; typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
// DoubleStored gauge field // DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
// Spin vector // Spin vector
typedef iSpinVector<Complex > SpinVector; typedef iSpinVector<Complex > SpinVector;
typedef iSpinVector<ComplexF> SpinVectorF; typedef iSpinVector<ComplexF> SpinVectorF;
typedef iSpinVector<ComplexD> SpinVectorD; typedef iSpinVector<ComplexD> SpinVectorD;
typedef iSpinVector<vComplex > vSpinVector; typedef iSpinVector<vComplex > vSpinVector;
typedef iSpinVector<vComplexF> vSpinVectorF; typedef iSpinVector<vComplexF> vSpinVectorF;
typedef iSpinVector<vComplexD> vSpinVectorD; typedef iSpinVector<vComplexD> vSpinVectorD;
// Colour vector // Colour vector
typedef iColourVector<Complex > ColourVector; typedef iColourVector<Complex > ColourVector;
typedef iColourVector<ComplexF> ColourVectorF; typedef iColourVector<ComplexF> ColourVectorF;
typedef iColourVector<ComplexD> ColourVectorD; typedef iColourVector<ComplexD> ColourVectorD;
typedef iColourVector<vComplex > vColourVector; typedef iColourVector<vComplex > vColourVector;
typedef iColourVector<vComplexF> vColourVectorF; typedef iColourVector<vComplexF> vColourVectorF;
typedef iColourVector<vComplexD> vColourVectorD; typedef iColourVector<vComplexD> vColourVectorD;
// SpinColourVector // SpinColourVector
typedef iSpinColourVector<Complex > SpinColourVector; typedef iSpinColourVector<Complex > SpinColourVector;
typedef iSpinColourVector<ComplexF> SpinColourVectorF; typedef iSpinColourVector<ComplexF> SpinColourVectorF;
typedef iSpinColourVector<ComplexD> SpinColourVectorD; typedef iSpinColourVector<ComplexD> SpinColourVectorD;
typedef iSpinColourVector<vComplex > vSpinColourVector; typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSpinColourVector<vComplexF> vSpinColourVectorF; typedef iSpinColourVector<vComplexF> vSpinColourVectorF;
typedef iSpinColourVector<vComplexD> vSpinColourVectorD; typedef iSpinColourVector<vComplexD> vSpinColourVectorD;
// HalfSpin vector // HalfSpin vector
typedef iHalfSpinVector<Complex > HalfSpinVector; typedef iHalfSpinVector<Complex > HalfSpinVector;
typedef iHalfSpinVector<ComplexF> HalfSpinVectorF; typedef iHalfSpinVector<ComplexF> HalfSpinVectorF;
typedef iHalfSpinVector<ComplexD> HalfSpinVectorD; typedef iHalfSpinVector<ComplexD> HalfSpinVectorD;
typedef iHalfSpinVector<vComplex > vHalfSpinVector; typedef iHalfSpinVector<vComplex > vHalfSpinVector;
typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF; typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF;
typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD; typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD;
// HalfSpinColour vector // HalfSpinColour vector
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector; typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF; typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD; typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector; typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF; typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD; typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
// singlets // singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<ComplexF> TComplexF; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tensor singlet complex type.
typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without? typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without? typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealD> TRealD; // Shouldn't need these; can I make it work without? typedef iSinglet<RealD> TRealD; // Shouldn't need these; can I make it work without?
typedef iSinglet<vReal > vTReal; typedef iSinglet<vReal > vTReal;
typedef iSinglet<vRealF> vTRealF; typedef iSinglet<vRealF> vTRealF;
typedef iSinglet<vRealD> vTRealD; typedef iSinglet<vRealD> vTRealD;
typedef iSinglet<vInteger> vTInteger; typedef iSinglet<vInteger> vTInteger;
typedef iSinglet<Integer > TInteger; typedef iSinglet<Integer > TInteger;
// Lattices of these // Lattices of these
typedef Lattice<vColourMatrix> LatticeColourMatrix; typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vColourMatrixF> LatticeColourMatrixF; typedef Lattice<vColourMatrixF> LatticeColourMatrixF;
typedef Lattice<vColourMatrixD> LatticeColourMatrixD; typedef Lattice<vColourMatrixD> LatticeColourMatrixD;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix; typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF; typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF;
typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD; typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix; typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF; typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF;
typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD; typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD;
typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix; typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix;
typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF; typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF;
typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD; typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD;
typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix; typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix;
typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF; typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD; typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
// DoubleStored gauge field // DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix; typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF; typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD; typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
typedef Lattice<vSpinVector> LatticeSpinVector; typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vSpinVectorF> LatticeSpinVectorF; typedef Lattice<vSpinVectorF> LatticeSpinVectorF;
typedef Lattice<vSpinVectorD> LatticeSpinVectorD; typedef Lattice<vSpinVectorD> LatticeSpinVectorD;
typedef Lattice<vColourVector> LatticeColourVector; typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vColourVectorF> LatticeColourVectorF; typedef Lattice<vColourVectorF> LatticeColourVectorF;
typedef Lattice<vColourVectorD> LatticeColourVectorD; typedef Lattice<vColourVectorD> LatticeColourVectorD;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector; typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF; typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF;
typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD; typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector; typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF; typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF;
typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD; typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector; typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF; typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD; typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
typedef Lattice<vTReal> LatticeReal; typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTRealF> LatticeRealF; typedef Lattice<vTRealF> LatticeRealF;
typedef Lattice<vTRealD> LatticeRealD; typedef Lattice<vTRealD> LatticeRealD;
typedef Lattice<vTComplex> LatticeComplex; typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTComplexF> LatticeComplexF; typedef Lattice<vTComplexF> LatticeComplexF;
typedef Lattice<vTComplexD> LatticeComplexD; typedef Lattice<vTComplexD> LatticeComplexD;
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where" typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
/////////////////////////////////////////// ///////////////////////////////////////////
// Physical names for things // Physical names for things
/////////////////////////////////////////// ///////////////////////////////////////////
typedef LatticeHalfSpinColourVector LatticeHalfFermion; typedef LatticeHalfSpinColourVector LatticeHalfFermion;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF; typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD; typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
typedef LatticeSpinColourVector LatticeFermion; typedef LatticeSpinColourVector LatticeFermion;
typedef LatticeSpinColourVectorF LatticeFermionF; typedef LatticeSpinColourVectorF LatticeFermionF;
typedef LatticeSpinColourVectorD LatticeFermionD; typedef LatticeSpinColourVectorD LatticeFermionD;
typedef LatticeSpinColourMatrix LatticePropagator; typedef LatticeSpinColourMatrix LatticePropagator;
typedef LatticeSpinColourMatrixF LatticePropagatorF; typedef LatticeSpinColourMatrixF LatticePropagatorF;
typedef LatticeSpinColourMatrixD LatticePropagatorD; typedef LatticeSpinColourMatrixD LatticePropagatorD;
typedef LatticeLorentzColourMatrix LatticeGaugeField; typedef LatticeLorentzColourMatrix LatticeGaugeField;
typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF; typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF;
typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD; typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD;
typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField; typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField;
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF; typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD; typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >; template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
// Uhgg... typing this hurt ;) // Uhgg... typing this hurt ;)
// (my keyboard got burning hot when I typed this, must be the anti-Fermion) // (my keyboard got burning hot when I typed this, must be the anti-Fermion)
typedef Lattice<vColourVector> LatticeStaggeredFermion; typedef Lattice<vColourVector> LatticeStaggeredFermion;
typedef Lattice<vColourVectorF> LatticeStaggeredFermionF; typedef Lattice<vColourVectorF> LatticeStaggeredFermionF;
typedef Lattice<vColourVectorD> LatticeStaggeredFermionD; typedef Lattice<vColourVectorD> LatticeStaggeredFermionD;
typedef Lattice<vColourMatrix> LatticeStaggeredPropagator; typedef Lattice<vColourMatrix> LatticeStaggeredPropagator;
typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF; typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF;
typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD; typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD;
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Peek and Poke named after physics attributes // Peek and Poke named after physics attributes
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
//spin //spin
template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0)) template<class vobj> auto peekSpin(const vobj &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
{ {
return PeekIndex<SpinIndex>(rhs,i); return PeekIndex<SpinIndex>(rhs,i);
} }
template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0)) template<class vobj> auto peekSpin(const vobj &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
{ {
return PeekIndex<SpinIndex>(rhs,i,j); return PeekIndex<SpinIndex>(rhs,i,j);
} }
template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0)) template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<SpinIndex>(rhs,0))
{ {
return PeekIndex<SpinIndex>(rhs,i); return PeekIndex<SpinIndex>(rhs,i);
} }
template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0)) template<class vobj> auto peekSpin(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<SpinIndex>(rhs,0,0))
{ {
return PeekIndex<SpinIndex>(rhs,i,j); return PeekIndex<SpinIndex>(rhs,i,j);
} }
//colour //colour
template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0)) template<class vobj> auto peekColour(const vobj &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
{ {
return PeekIndex<ColourIndex>(rhs,i); return PeekIndex<ColourIndex>(rhs,i);
} }
template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0)) template<class vobj> auto peekColour(const vobj &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
{ {
return PeekIndex<ColourIndex>(rhs,i,j); return PeekIndex<ColourIndex>(rhs,i,j);
} }
template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0)) template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<ColourIndex>(rhs,0))
{ {
return PeekIndex<ColourIndex>(rhs,i); return PeekIndex<ColourIndex>(rhs,i);
} }
template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0)) template<class vobj> auto peekColour(const Lattice<vobj> &rhs,int i,int j) -> decltype(PeekIndex<ColourIndex>(rhs,0,0))
{ {
return PeekIndex<ColourIndex>(rhs,i,j); return PeekIndex<ColourIndex>(rhs,i,j);
} }
//lorentz //lorentz
template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0)) template<class vobj> auto peekLorentz(const vobj &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
{ {
return PeekIndex<LorentzIndex>(rhs,i); return PeekIndex<LorentzIndex>(rhs,i);
} }
template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0)) template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltype(PeekIndex<LorentzIndex>(rhs,0))
{ {
return PeekIndex<LorentzIndex>(rhs,i); return PeekIndex<LorentzIndex>(rhs,i);
} }
////////////////////////////////////////////// //////////////////////////////////////////////
// Poke lattice // Poke lattice
////////////////////////////////////////////// //////////////////////////////////////////////
template<class vobj> template<class vobj>
void pokeColour(Lattice<vobj> &lhs, void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs, const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0))> & rhs,
int i)
{
PokeIndex<ColourIndex>(lhs,rhs,i);
}
template<class vobj>
void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0,0))> & rhs,
int i,int j)
{
PokeIndex<ColourIndex>(lhs,rhs,i,j);
}
template<class vobj>
void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0))> & rhs,
int i) int i)
{ {
PokeIndex<ColourIndex>(lhs,rhs,i); PokeIndex<SpinIndex>(lhs,rhs,i);
} }
template<class vobj> template<class vobj>
void pokeColour(Lattice<vobj> &lhs, void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs, const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0,0))> & rhs,
int i,int j) int i,int j)
{ {
PokeIndex<ColourIndex>(lhs,rhs,i,j); PokeIndex<SpinIndex>(lhs,rhs,i,j);
} }
template<class vobj> template<class vobj>
void pokeSpin(Lattice<vobj> &lhs, void pokeLorentz(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs, const Lattice<decltype(peekIndex<LorentzIndex>(vobj(),0))> & rhs,
int i) int i)
{ {
PokeIndex<SpinIndex>(lhs,rhs,i); PokeIndex<LorentzIndex>(lhs,rhs,i);
} }
template<class vobj>
void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
int i,int j)
{
PokeIndex<SpinIndex>(lhs,rhs,i,j);
}
template<class vobj>
void pokeLorentz(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
int i)
{
PokeIndex<LorentzIndex>(lhs,rhs,i);
}
////////////////////////////////////////////// //////////////////////////////////////////////
// Poke scalars // Poke scalars
////////////////////////////////////////////// //////////////////////////////////////////////
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i) template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
{ {
pokeIndex<SpinIndex>(lhs,rhs,i); pokeIndex<SpinIndex>(lhs,rhs,i);
} }
template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j) template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0,0)) & rhs,int i,int j)
{ {
pokeIndex<SpinIndex>(lhs,rhs,i,j); pokeIndex<SpinIndex>(lhs,rhs,i,j);
} }
template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i) template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0)) & rhs,int i)
{ {
pokeIndex<ColourIndex>(lhs,rhs,i); pokeIndex<ColourIndex>(lhs,rhs,i);
} }
template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j) template<class vobj> void pokeColour(vobj &lhs,const decltype(peekIndex<ColourIndex>(lhs,0,0)) & rhs,int i,int j)
{ {
pokeIndex<ColourIndex>(lhs,rhs,i,j); pokeIndex<ColourIndex>(lhs,rhs,i,j);
} }
template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i) template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<LorentzIndex>(lhs,0)) & rhs,int i)
{ {
pokeIndex<LorentzIndex>(lhs,rhs,i); pokeIndex<LorentzIndex>(lhs,rhs,i);
} }
////////////////////////////////////////////// //////////////////////////////////////////////
// Fermion <-> propagator assignements // Fermion <-> propagator assignements
////////////////////////////////////////////// //////////////////////////////////////////////
//template <class Prop, class Ferm> //template <class Prop, class Ferm>
template <class Fimpl> template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c) void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{
for(int j = 0; j < Ns; ++j)
{ {
for(int j = 0; j < Ns; ++j) auto pjs = peekSpin(p, j, s);
{ auto fj = peekSpin(f, j);
auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j);
for(int i = 0; i < Fimpl::Dimension; ++i) for(int i = 0; i < Fimpl::Dimension; ++i)
{ {
pokeColour(pjs, peekColour(fj, i), i, c); pokeColour(pjs, peekColour(fj, i), i, c);
} }
pokeSpin(p, pjs, j, s); pokeSpin(p, pjs, j, s);
}
} }
}
//template <class Prop, class Ferm> //template <class Prop, class Ferm>
template <class Fimpl> template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c) void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{
for(int j = 0; j < Ns; ++j)
{ {
for(int j = 0; j < Ns; ++j) auto pjs = peekSpin(p, j, s);
{ auto fj = peekSpin(f, j);
auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j);
for(int i = 0; i < Fimpl::Dimension; ++i) for(int i = 0; i < Fimpl::Dimension; ++i)
{ {
pokeColour(fj, peekColour(pjs, i, c), i); pokeColour(fj, peekColour(pjs, i, c), i);
} }
pokeSpin(f, fj, j); pokeSpin(f, fj, j);
}
} }
}
////////////////////////////////////////////// //////////////////////////////////////////////
// transpose array and scalar // transpose array and scalar
////////////////////////////////////////////// //////////////////////////////////////////////
template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){ template<int Index,class vobj> inline Lattice<vobj> transposeSpin(const Lattice<vobj> &lhs){
return transposeIndex<SpinIndex>(lhs); return transposeIndex<SpinIndex>(lhs);
} }
template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){ template<int Index,class vobj> inline Lattice<vobj> transposeColour(const Lattice<vobj> &lhs){
return transposeIndex<ColourIndex>(lhs); return transposeIndex<ColourIndex>(lhs);
} }
template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){ template<int Index,class vobj> inline vobj transposeSpin(const vobj &lhs){
return transposeIndex<SpinIndex>(lhs); return transposeIndex<SpinIndex>(lhs);
} }
template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){ template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
return transposeIndex<ColourIndex>(lhs); return transposeIndex<ColourIndex>(lhs);
} }
////////////////////////////////////////// //////////////////////////////////////////
// Trace lattice and non-lattice // Trace lattice and non-lattice
////////////////////////////////////////// //////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs._odata[0]))> inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
{ {
return traceIndex<SpinIndex>(lhs); return traceIndex<SpinIndex>(lhs);
} }
template<int Index,class vobj> template<int Index,class vobj>
inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs._odata[0]))> inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
{ {
return traceIndex<ColourIndex>(lhs); return traceIndex<ColourIndex>(lhs);
} }
template<int Index,class vobj> template<int Index,class vobj>
inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))> inline auto traceSpin(const vobj &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs))>
{ {
return traceIndex<SpinIndex>(lhs); return traceIndex<SpinIndex>(lhs);
} }
template<int Index,class vobj> template<int Index,class vobj>
inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))> inline auto traceColour(const vobj &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs))>
{ {
return traceIndex<ColourIndex>(lhs); return traceIndex<ColourIndex>(lhs);
} }
////////////////////////////////////////// //////////////////////////////////////////
// Current types // Current types
////////////////////////////////////////// //////////////////////////////////////////
GRID_SERIALIZABLE_ENUM(Current, undef, GRID_SERIALIZABLE_ENUM(Current, undef,
Vector, 0, Vector, 0,
Axial, 1, Axial, 1,
Tadpole, 2); Tadpole, 2);
} //namespace QCD NAMESPACE_END(Grid);
} // Grid
#endif

View File

@ -37,14 +37,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// Abstract base interface // Abstract base interface
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/ActionCore.h> #include <Grid/qcd/action/ActionCore.h>
NAMESPACE_CHECK(ActionCore);
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Fermion actions; prevent coupling fermion.cc files to other headers // Fermion actions; prevent coupling fermion.cc files to other headers
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FermionCore.h> #include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_CHECK(FermionCore);
#include <Grid/qcd/action/fermion/Fermion.h> #include <Grid/qcd/action/fermion/Fermion.h>
NAMESPACE_CHECK(Fermion);
//////////////////////////////////////// ////////////////////////////////////////
// Pseudo fermion combinations for HMC // Pseudo fermion combinations for HMC
//////////////////////////////////////// ////////////////////////////////////////
#include <Grid/qcd/action/pseudofermion/PseudoFermion.h> #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
NAMESPACE_CHECK(PseudoFermion);
#endif #endif

View File

@ -27,19 +27,18 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef ACTION_BASE_H #ifndef ACTION_BASE_H
#define ACTION_BASE_H #define ACTION_BASE_H
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
template <class GaugeField > template <class GaugeField >
class Action class Action
{ {
public: public:
bool is_smeared = false; bool is_smeared = false;
// Heatbath? // Heatbath?
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
@ -50,7 +49,6 @@ class Action
virtual ~Action(){} virtual ~Action(){}
}; };
} NAMESPACE_END(Grid);
}
#endif // ACTION_BASE_H #endif // ACTION_BASE_H

View File

@ -31,29 +31,37 @@ directory
#define QCD_ACTION_CORE #define QCD_ACTION_CORE
#include <Grid/qcd/action/ActionBase.h> #include <Grid/qcd/action/ActionBase.h>
NAMESPACE_CHECK(ActionBase);
#include <Grid/qcd/action/ActionSet.h> #include <Grid/qcd/action/ActionSet.h>
NAMESPACE_CHECK(ActionSet);
#include <Grid/qcd/action/ActionParams.h> #include <Grid/qcd/action/ActionParams.h>
NAMESPACE_CHECK(ActionParams);
//////////////////////////////////////////// ////////////////////////////////////////////
// Gauge Actions // Gauge Actions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/gauge/Gauge.h> #include <Grid/qcd/action/gauge/Gauge.h>
NAMESPACE_CHECK(Gauge);
//////////////////////////////////////////// ////////////////////////////////////////////
// Fermion prereqs // Fermion prereqs
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FermionCore.h> #include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_CHECK(ActionFermionCore);
//////////////////////////////////////////// ////////////////////////////////////////////
// Scalar Actions // Scalar Actions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/scalar/Scalar.h> #include <Grid/qcd/action/scalar/Scalar.h>
NAMESPACE_CHECK(Scalar);
//////////////////////////////////////////// ////////////////////////////////////////////
// Utility functions // Utility functions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/utils/Metric.h> #include <Grid/qcd/utils/Metric.h>
NAMESPACE_CHECK(Metric);
#include <Grid/qcd/utils/CovariantLaplacian.h> #include <Grid/qcd/utils/CovariantLaplacian.h>
NAMESPACE_CHECK(CovariantLaplacian);

View File

@ -27,37 +27,35 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_QCD_ACTION_PARAMS_H #ifndef GRID_QCD_ACTION_PARAMS_H
#define GRID_QCD_ACTION_PARAMS_H #define GRID_QCD_ACTION_PARAMS_H
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
// These can move into a params header and be given MacroMagic serialisation // These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams { struct GparityWilsonImplParams {
bool overlapCommsCompute; Coordinate twists;
std::vector<int> twists; GparityWilsonImplParams() : twists(Nd, 0) {};
GparityWilsonImplParams() : twists(Nd, 0), overlapCommsCompute(false){}; };
};
struct WilsonImplParams { struct WilsonImplParams {
bool overlapCommsCompute; bool overlapCommsCompute;
std::vector<Real> twist_n_2pi_L; AcceleratorVector<Real,Nd> twist_n_2pi_L;
std::vector<Complex> boundary_phases; AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() : overlapCommsCompute(false) { WilsonImplParams() {
boundary_phases.resize(Nd, 1.0); boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
};
WilsonImplParams(const std::vector<Complex> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0);
}
}; };
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0);
}
};
struct StaggeredImplParams { struct StaggeredImplParams {
StaggeredImplParams() {}; StaggeredImplParams() {};
}; };
struct OneFlavourRationalParams : Serializable { struct OneFlavourRationalParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams, GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams,
@ -69,10 +67,10 @@ namespace QCD {
int, precision, int, precision,
int, BoundsCheckFreq); int, BoundsCheckFreq);
// MaxIter and tolerance, vectors?? // MaxIter and tolerance, vectors??
// constructor // constructor
OneFlavourRationalParams( RealD _lo = 0.0, OneFlavourRationalParams( RealD _lo = 0.0,
RealD _hi = 1.0, RealD _hi = 1.0,
int _maxit = 1000, int _maxit = 1000,
RealD tol = 1.0e-8, RealD tol = 1.0e-8,
@ -88,11 +86,6 @@ namespace QCD {
BoundsCheckFreq(_BoundsCheckFreq){}; BoundsCheckFreq(_BoundsCheckFreq){};
}; };
NAMESPACE_END(Grid);
}
}
#endif #endif

View File

@ -26,14 +26,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
See the full license in the file "LICENSE" in the top level distribution See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef ACTION_SET_H #ifndef ACTION_SET_H
#define ACTION_SET_H #define ACTION_SET_H
namespace Grid { NAMESPACE_BEGIN(Grid);
// Should drop this namespace here
namespace QCD {
////////////////////////////////// //////////////////////////////////
// Indexing of tuple types // Indexing of tuple types
@ -62,7 +59,7 @@ struct Index<T, std::tuple<U, Types...>> {
template <class Field, class Repr = NoHirep > template <class Field, class Repr = NoHirep >
struct ActionLevel { struct ActionLevel {
public: public:
unsigned int multiplier; unsigned int multiplier;
// Fundamental repr actions separated because of the smearing // Fundamental repr actions separated because of the smearing
@ -77,7 +74,7 @@ struct ActionLevel {
std::vector<ActPtr>& actions; std::vector<ActPtr>& actions;
explicit ActionLevel(unsigned int mul = 1) : explicit ActionLevel(unsigned int mul = 1) :
actions(std::get<0>(actions_hirep)), multiplier(mul) { actions(std::get<0>(actions_hirep)), multiplier(mul) {
// initialize the hirep vectors to zero. // initialize the hirep vectors to zero.
// apply(this->resize, actions_hirep, 0); //need a working resize // apply(this->resize, actions_hirep, 0); //need a working resize
assert(mul >= 1); assert(mul >= 1);
@ -87,7 +84,7 @@ struct ActionLevel {
void push_back(Action<GenField>* ptr) { void push_back(Action<GenField>* ptr) {
// insert only in the correct vector // insert only in the correct vector
std::get< Index < GenField, action_hirep_types>::value >(actions_hirep).push_back(ptr); std::get< Index < GenField, action_hirep_types>::value >(actions_hirep).push_back(ptr);
}; }
template <class ActPtr> template <class ActPtr>
static void resize(ActPtr ap, unsigned int n) { static void resize(ActPtr ap, unsigned int n) {
@ -110,7 +107,6 @@ struct ActionLevel {
template <class GaugeField, class R> template <class GaugeField, class R>
using ActionSet = std::vector<ActionLevel<GaugeField, R> >; using ActionSet = std::vector<ActionLevel<GaugeField, R> >;
} // QCD NAMESPACE_END(Grid);
} // Grid
#endif // ACTION_SET_H #endif // ACTION_SET_H

Some files were not shown because too many files have changed in this diff Show More