1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-19 15:34:45 +01:00

Compare commits

..

1 Commits

Author SHA1 Message Date
Peter Boyle
ac1d655de8 Const correctness patch 2018-11-19 10:38:36 +00:00
1000 changed files with 55560 additions and 62569 deletions

1
.gitignore vendored
View File

@@ -114,4 +114,3 @@ gh-pages/
##################### #####################
Grid/qcd/spin/gamma-gen/*.h Grid/qcd/spin/gamma-gen/*.h
Grid/qcd/spin/gamma-gen/*.cc Grid/qcd/spin/gamma-gen/*.cc
Grid/util/Version.h

View File

@@ -1,5 +0,0 @@
Version : 0.8.0
- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
- MPI and MPI3 comms optimisations for KNL and OPA finished
- Half precision comms

View File

@@ -30,34 +30,8 @@ directory
#ifndef DISABLE_WARNINGS_H #ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H #define DISABLE_WARNINGS_H
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
//disables and intel compiler specific warning (in json.hpp) //disables and intel compiler specific warning (in json.hpp)
#pragma warning disable 488 #pragma warning disable 488
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC
#ifdef __ALTIVEC__
#define EIGEN_DONT_VECTORIZE
#endif
#ifdef __VSX__
#define EIGEN_DONT_VECTORIZE
#endif
#endif #endif

View File

@@ -42,7 +42,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridQCDcore.h> #include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/Action.h> #include <Grid/qcd/action/Action.h>
#include <Grid/qcd/utils/GaugeFix.h> #include <Grid/qcd/utils/GaugeFix.h>
#include <Grid/qcd/utils/CovariantSmearing.h>
#include <Grid/qcd/smearing/Smearing.h> #include <Grid/qcd/smearing/Smearing.h>
#include <Grid/parallelIO/MetaData.h> #include <Grid/parallelIO/MetaData.h>
#include <Grid/qcd/hmc/HMC_aggregate.h> #include <Grid/qcd/hmc/HMC_aggregate.h>

View File

@@ -38,19 +38,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_BASE_H #ifndef GRID_BASE_H
#define GRID_BASE_H #define GRID_BASE_H
#include <Grid/DisableWarnings.h>
#include <Grid/Namespace.h>
#include <Grid/GridStd.h> #include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h> #include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h> #include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/threads/Threads.h>
#include <Grid/serialisation/Serialisation.h> #include <Grid/serialisation/Serialisation.h>
#include <Grid/threads/Threads.h>
#include <Grid/util/Util.h>
#include <Grid/util/Sha.h> #include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h> #include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h> #include <Grid/cartesian/Cartesian.h>
@@ -60,6 +57,5 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/stencil/Stencil.h> #include <Grid/stencil/Stencil.h>
#include <Grid/parallelIO/BinaryIO.h> #include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h> #include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore)
#endif #endif

View File

@@ -38,6 +38,5 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/qcd/spin/Spin.h> #include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/utils/Utils.h> #include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h> #include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);
#endif #endif

View File

@@ -7,7 +7,6 @@
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <vector> #include <vector>
#include <array>
#include <string> #include <string>
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>

View File

@@ -1,41 +1,14 @@
#include <Grid/GridCore.h>
#pragma once #pragma once
// Force Eigen to use MKL if Grid has been configured with --enable-mkl // Force Eigen to use MKL if Grid has been configured with --enable-mkl
#ifdef USE_MKL #ifdef USE_MKL
#define EIGEN_USE_MKL_ALL #define EIGEN_USE_MKL_ALL
#endif #endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif #endif
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#pragma diag_suppress code_is_unreachable
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__
#endif
#include <Grid/Eigen/Dense> #include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
/* NVCC restore */
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop
#endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif

View File

@@ -1 +0,0 @@
#include <Grid/Grid_Eigen_Dense.h>

View File

@@ -1,38 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Namespace.h
Copyright (C) 2016
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <type_traits>
#include <cassert>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );

View File

@@ -35,7 +35,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Zolotarev.h> #include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h> #include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/JacobiPolynomial.h>
#include <Grid/algorithms/approx/Remez.h> #include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h> #include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h> #include <Grid/algorithms/approx/Forecast.h>
@@ -49,16 +48,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h> #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/MinimalResidual.h>
#include <Grid/algorithms/iterative/GeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/CoarsenedMatrix.h> #include <Grid/algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>
// EigCg
// Pcg
// Hdcg
// GCR
// etc..
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class scalar> struct FFTW { }; template<class scalar> struct FFTW { };
@@ -115,9 +115,9 @@ private:
double flops_call; double flops_call;
uint64_t usec; uint64_t usec;
Coordinate dimensions; std::vector<int> dimensions;
Coordinate processors; std::vector<int> processors;
Coordinate processor_coor; std::vector<int> processor_coor;
public: public:
@@ -137,7 +137,7 @@ public:
{ {
flops=0; flops=0;
usec =0; usec =0;
Coordinate layout(Nd,1); std::vector<int> layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors); sgrid = new GridCartesian(dimensions,layout,processors);
}; };
@@ -146,10 +146,10 @@ public:
} }
template<class vobj> template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){ void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){
conformable(result.Grid(),vgrid); conformable(result._grid,vgrid);
conformable(source.Grid(),vgrid); conformable(source._grid,vgrid);
Lattice<vobj> tmp(vgrid); Lattice<vobj> tmp(vgrid);
tmp = source; tmp = source;
for(int d=0;d<Nd;d++){ for(int d=0;d<Nd;d++){
@@ -162,7 +162,7 @@ public:
template<class vobj> template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){ void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
Coordinate mask(Nd,1); std::vector<int> mask(Nd,1);
FFT_dim_mask(result,source,mask,sign); FFT_dim_mask(result,source,mask,sign);
} }
@@ -172,14 +172,14 @@ public:
#ifndef HAVE_FFTW #ifndef HAVE_FFTW
assert(0); assert(0);
#else #else
conformable(result.Grid(),vgrid); conformable(result._grid,vgrid);
conformable(source.Grid(),vgrid); conformable(source._grid,vgrid);
int L = vgrid->_ldimensions[dim]; int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim]; int G = vgrid->_fdimensions[dim];
Coordinate layout(Nd,1); std::vector<int> layout(Nd,1);
Coordinate pencil_gd(vgrid->_fdimensions); std::vector<int> pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim]; pencil_gd[dim] = G*processors[dim];
@@ -191,7 +191,7 @@ public:
typedef typename sobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View();
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -217,8 +217,8 @@ public:
FFTW_plan p; FFTW_plan p;
{ {
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0]; FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany, p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed, in,inembed,
istride,idist, istride,idist,
@@ -228,20 +228,26 @@ public:
} }
// Barrel shift and collect global pencil // Barrel shift and collect global pencil
Coordinate lcoor(Nd), gcoor(Nd); std::vector<int> lcoor(Nd), gcoor(Nd);
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) { for(int p=0;p<processors[dim];p++) {
thread_for(idx, sgrid->lSites(),{ PARALLEL_REGION
Coordinate cbuf(Nd); {
std::vector<int> cbuf(Nd);
sobj s; sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf); sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf); peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L; cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L; // cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf); pokeLocalSite(s,pgbuf,cbuf);
}); }
if (p != processors[dim] - 1) { }
if (p != processors[dim] - 1)
{
result = Cshift(result,dim,L); result = Cshift(result,dim,L);
} }
} }
@@ -250,15 +256,20 @@ public:
int NN=pencil_g.lSites(); int NN=pencil_g.lSites();
GridStopWatch timer; GridStopWatch timer;
timer.Start(); timer.Start();
thread_for( idx,NN,{ PARALLEL_REGION
Coordinate cbuf(Nd); {
std::vector<int> cbuf(Nd);
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<NN;idx++) {
pencil_g.LocalIndexToLocalCoor(idx, cbuf); pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0 if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx]; FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out); FFTW<scalar>::fftw_execute_dft(p,in,out);
} }
}); }
}
timer.Stop(); timer.Stop();
// performance counting // performance counting
@@ -269,15 +280,20 @@ public:
flops+= flops_call*NN; flops+= flops_call*NN;
// writing out result // writing out result
thread_for(idx,sgrid->lSites(),{ PARALLEL_REGION
Coordinate clbuf(Nd), cgbuf(Nd); {
std::vector<int> clbuf(Nd), cgbuf(Nd);
sobj s; sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,clbuf); sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf; cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc; cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf); peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf); pokeLocalSite(s,result,clbuf);
}); }
}
result = result*div; result = result*div;
// destroying plan // destroying plan
@@ -285,7 +301,6 @@ public:
#endif #endif
} }
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -26,15 +26,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_ALGORITHM_LINEAR_OP_H
#define GRID_ALGORITHM_LINEAR_OP_H
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// LinearOperators Take a something and return a something. // LinearOperators Take a something and return a something.
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// //
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian Conjugateugate (transpose if real): // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugateugate (transpose if real):
//SBase //SBase
// i) F(a x + b y) = aF(x) + b F(y). // i) F(a x + b y) = aF(x) + b F(y).
// ii) <x|Op|y> = <y|AdjOp|x>^\ast // ii) <x|Op|y> = <y|AdjOp|x>^\ast
@@ -47,7 +48,6 @@ public:
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
virtual void OpDirAll (const Field &in, std::vector<Field> &out) = 0; // Abstract base
virtual void Op (const Field &in, Field &out) = 0; // Abstract base virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
@@ -84,9 +84,6 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
} }
@@ -97,7 +94,8 @@ public:
_Mat.MdagM(in,out,n1,n2); _Mat.MdagM(in,out,n1,n2);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
_Mat.MdagM(in,out); RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
} }
}; };
@@ -119,9 +117,6 @@ public:
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
assert(0); assert(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
assert(0); assert(0);
@@ -160,9 +155,6 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
} }
@@ -171,6 +163,7 @@ public:
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out); _Mat.M(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot); ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
@@ -179,35 +172,6 @@ public:
} }
}; };
template<class Matrix,class Field>
class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several // Even Odd Schur decomp operators; there are several
// ways to introduce the even odd checkerboarding // ways to introduce the even odd checkerboarding
@@ -219,13 +183,13 @@ public:
virtual RealD Mpc (const Field &in, Field &out) =0; virtual RealD Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0; virtual RealD MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in.Grid()); Field tmp(in._grid);
tmp.Checkerboard() = in.Checkerboard(); tmp.checkerboard = in.checkerboard;
ni=Mpc(in,tmp); ni=Mpc(in,tmp);
no=MpcDag(tmp,out); no=MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.Checkerboard() = in.Checkerboard(); out.checkerboard = in.checkerboard;
MpcDagMpc(in,out,n1,n2); MpcDagMpc(in,out,n1,n2);
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
@@ -245,30 +209,28 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0); assert(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
}; };
template<class Matrix,class Field> template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> { class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
public: protected:
Matrix &_Mat; Matrix &_Mat;
public:
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in._grid);
tmp.Checkerboard() = !in.Checkerboard(); tmp.checkerboard = !in.checkerboard;
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl; //std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp); _Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out); _Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl; //std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl;
_Mat.Mooee(in,out); _Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out); return axpy_norm(out,-1.0,tmp,out);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in._grid);
_Mat.MeooeDag(in,tmp); _Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out); _Mat.MooeeInvDag(tmp,out);
@@ -286,7 +248,7 @@ public:
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in._grid);
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp); _Mat.MooeeInv(out,tmp);
@@ -296,7 +258,7 @@ public:
return axpy_norm(out,-1.0,tmp,in); return axpy_norm(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in._grid);
_Mat.MooeeInvDag(in,out); _Mat.MooeeInvDag(in,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
@@ -314,7 +276,7 @@ public:
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in._grid);
_Mat.MooeeInv(in,out); _Mat.MooeeInv(in,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
@@ -324,7 +286,7 @@ public:
return axpy_norm(out,-1.0,tmp,in); return axpy_norm(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in.Grid()); Field tmp(in._grid);
_Mat.MeooeDag(in,out); _Mat.MeooeDag(in,out);
_Mat.MooeeInvDag(out,tmp); _Mat.MooeeInvDag(out,tmp);
@@ -336,7 +298,7 @@ public:
}; };
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta // Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ; template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ; template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
@@ -392,17 +354,7 @@ public:
axpby(out,-1.0,mass*mass,tmp,in); axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond(); taxpby_norm+=usecond();
} }
virtual RealD Mpc (const Field &in, Field &out) virtual RealD Mpc (const Field &in, Field &out) {
{
Field tmp(in.Grid());
Field tmp2(in.Grid());
// std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
_Mat.Mooee(in,out);
_Mat.Mooee(out,tmp);
// std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
tMeo-=usecond(); tMeo-=usecond();
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
@@ -428,12 +380,6 @@ template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOpera
template<class Field> class OperatorFunction { template<class Field> class OperatorFunction {
public: public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0; virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
assert(in.size()==out.size());
for(int k=0;k<in.size();k++){
(*this)(Linop,in[k],out[k]);
}
};
}; };
template<class Field> class LinearFunction { template<class Field> class LinearFunction {
@@ -513,15 +459,13 @@ class Polynomial : public OperatorFunction<Field> {
private: private:
std::vector<RealD> Coeffs; std::vector<RealD> Coeffs;
public: public:
using OperatorFunction<Field>::operator();
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { }; Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface // Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in.Grid()); Field AtoN(in._grid);
Field Mtmp(in.Grid()); Field Mtmp(in._grid);
AtoN = in; AtoN = in;
out = AtoN*Coeffs[0]; out = AtoN*Coeffs[0];
for(int n=1;n<Coeffs.size();n++){ for(int n=1;n<Coeffs.size();n++){
@@ -532,4 +476,6 @@ public:
}; };
}; };
NAMESPACE_END(Grid); }
#endif

View File

@@ -28,7 +28,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
#ifndef GRID_PRECONDITIONER_H #ifndef GRID_PRECONDITIONER_H
#define GRID_PRECONDITIONER_H #define GRID_PRECONDITIONER_H
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class Field> class Preconditioner : public LinearFunction<Field> { template<class Field> class Preconditioner : public LinearFunction<Field> {
virtual void operator()(const Field &src, Field & psi)=0; virtual void operator()(const Field &src, Field & psi)=0;
@@ -42,5 +42,5 @@ public:
TrivialPrecon(void){}; TrivialPrecon(void){};
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#define GRID_ALGORITHM_SPARSE_MATRIX_H #define GRID_ALGORITHM_SPARSE_MATRIX_H
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action // Interface defining what I expect of a general sparse matrix, such as a Fermion action
@@ -41,17 +41,12 @@ public:
virtual RealD M (const Field &in, Field &out)=0; virtual RealD M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0; virtual RealD Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp (in.Grid()); Field tmp (in._grid);
ni=M(in,tmp); ni=M(in,tmp);
no=Mdag(tmp,out); no=Mdag(tmp,out);
} }
virtual void MdagM(const Field &in, Field &out) {
RealD ni, no;
MdagM(in,out,ni,no);
}
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
}; };
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
@@ -60,14 +55,6 @@ public:
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> { template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public: public:
virtual GridBase *RedBlackGrid(void)=0; virtual GridBase *RedBlackGrid(void)=0;
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const Field &in, Field &out)=0; virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0; virtual void Mooee (const Field &in, Field &out)=0;
@@ -79,6 +66,6 @@ public:
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -32,7 +32,7 @@ Author: Christoph Lehner <clehner@bnl.gov>
#include <Grid/algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
struct ChebyParams : Serializable { struct ChebyParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams, GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
@@ -47,8 +47,6 @@ struct ChebyParams : Serializable {
template<class Field> template<class Field>
class Chebyshev : public OperatorFunction<Field> { class Chebyshev : public OperatorFunction<Field> {
private: private:
using OperatorFunction<Field>::operator();
std::vector<RealD> Coeffs; std::vector<RealD> Coeffs;
int order; int order;
RealD hi; RealD hi;
@@ -57,7 +55,7 @@ private:
public: public:
void csv(std::ostream &out){ void csv(std::ostream &out){
RealD diff = hi-lo; RealD diff = hi-lo;
RealD delta = diff*1.0e-9; RealD delta = (hi-lo)*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) { for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1; delta*=1.1;
RealD f = approx(x); RealD f = approx(x);
@@ -95,24 +93,6 @@ public:
Coeffs[order-1] = 1.; Coeffs[order-1] = 1.;
}; };
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
// Similar kick effect below the threshold as Lanczos filter approach
void InitLowPass(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD k=(order-1.0);
RealD s=std::cos( j*M_PI*(k+0.5)/order );
Coeffs[j] = s * 2.0/order;
}
};
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD)) void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{ {
lo=_lo; lo=_lo;
@@ -232,9 +212,9 @@ public:
// Implement the required interface // Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid(); GridBase *grid=in._grid;
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl; // std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl; //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites(); int vol=grid->gSites();
@@ -252,20 +232,20 @@ public:
RealD xscale = 2.0/(hi-lo); RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo); RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y); Linop.HermOp(T0,y);
axpby(T1,xscale,mscale,y,in); T1=y*xscale+in*mscale;
// sum = .5 c[0] T0 + c[1] T1 // sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1; out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
for(int n=2;n<order;n++){ for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y); Linop.HermOp(*Tn,y);
// y=xscale*y+mscale*(*Tn);
// *Tnp=2.0*y-(*Tnm); y=xscale*y+mscale*(*Tn);
// out=out+Coeffs[n]* (*Tnp);
axpby(y,xscale,mscale,y,(*Tn)); *Tnp=2.0*y-(*Tnm);
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
axpy(out,Coeffs[n],*Tnp,out); out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies // Cycle pointers to avoid copies
Field *swizzle = Tnm; Field *swizzle = Tnm;
Tnm =Tn; Tnm =Tn;
@@ -341,7 +321,7 @@ public:
// shift_Multiply in Rudy's code // shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out) void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{ {
GridBase *grid=in.Grid(); GridBase *grid=in._grid;
Field tmp(grid); Field tmp(grid);
RealD aa= alpha*alpha; RealD aa= alpha*alpha;
@@ -358,7 +338,7 @@ public:
// Implement the required interface // Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid(); GridBase *grid=in._grid;
int vol=grid->gSites(); int vol=grid->gSites();
@@ -393,5 +373,5 @@ public:
} }
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -31,7 +31,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#ifndef INCLUDED_FORECAST_H #ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H #define INCLUDED_FORECAST_H
NAMESPACE_BEGIN(Grid); namespace Grid {
// Abstract base class. // Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi) // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
@@ -57,10 +57,10 @@ public:
Field chi(phi); // forecasted solution Field chi(phi); // forecasted solution
// Trivial cases // Trivial cases
if(degree == 0){ chi = Zero(); return chi; } if(degree == 0){ chi = zero; return chi; }
else if(degree == 1){ return prev_solns[0]; } else if(degree == 1){ return prev_solns[0]; }
// RealD dot; RealD dot;
ComplexD xp; ComplexD xp;
Field r(phi); // residual Field r(phi); // residual
Field Mv(phi); Field Mv(phi);
@@ -92,7 +92,7 @@ public:
for(int j=0; j<degree; j++){ for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){ for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]); G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = conjugate(G[j][k]); G[k][j] = std::conj(G[j][k]);
}} }}
// Gauss-Jordan elimination with partial pivoting // Gauss-Jordan elimination with partial pivoting
@@ -100,7 +100,7 @@ public:
// Perform partial pivoting // Perform partial pivoting
int k = i; int k = i;
for(int j=i+1; j<degree; j++){ if(abs(G[j][j]) > abs(G[k][k])){ k = j; } } for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
if(k != i){ if(k != i){
xp = b[k]; xp = b[k];
b[k] = b[i]; b[k] = b[i];
@@ -121,7 +121,7 @@ public:
} }
// Use Gaussian elimination to solve equations and calculate initial guess // Use Gaussian elimination to solve equations and calculate initial guess
chi = Zero(); chi = zero;
r = phi; r = phi;
for(int i=degree-1; i>=0; i--){ for(int i=degree-1; i>=0; i--){
a[i] = 0.0; a[i] = 0.0;
@@ -136,7 +136,7 @@ public:
for(int i=0; i<degree; i++){ for(int i=0; i<degree; i++){
tmp = -b[i]; tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; } for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = conjugate(tmp)*tmp; tmp = std::conj(tmp)*tmp;
true_r += std::sqrt(tmp.real()); true_r += std::sqrt(tmp.real());
} }
@@ -147,6 +147,6 @@ public:
}; };
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -1,129 +0,0 @@
#ifndef GRID_JACOBIPOLYNOMIAL_H
#define GRID_JACOBIPOLYNOMIAL_H
#include <Grid/algorithms/LinearOperator.h>
NAMESPACE_BEGIN(Grid);
template<class Field>
class JacobiPolynomial : public OperatorFunction<Field> {
private:
using OperatorFunction<Field>::operator();
int order;
RealD hi;
RealD lo;
RealD alpha;
RealD beta;
public:
void csv(std::ostream &out){
csv(out,lo,hi);
}
void csv(std::ostream &out,RealD llo,RealD hhi){
RealD diff = hhi-llo;
RealD delta = diff*1.0e-5;
for (RealD x=llo-delta; x<=hhi; x+=delta) {
RealD f = approx(x);
out<< x<<" "<<f <<std::endl;
}
return;
}
JacobiPolynomial(){};
JacobiPolynomial(RealD _lo,RealD _hi,int _order,RealD _alpha, RealD _beta)
{
lo=_lo;
hi=_hi;
alpha=_alpha;
beta=_beta;
order=_order;
};
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1.0;
RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
Tn =T1;
Tnm=T0;
for(int n=2;n<=order;n++){
RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
Tnm=Tn;
Tn =Tnp;
}
return Tnp;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites();
Field T0(grid);
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// RealD T0=1.0;
T0=in;
// RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
// = x * 2/(hi-lo) - (hi+lo)/(hi-lo)
Linop.HermOp(T0,y);
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
y=y*xscale+in*mscale;
// RealD T1=(alpha-beta)*0.5+(alpha+beta+2.0)*0.5*y;
RealD halfAmB = (alpha-beta)*0.5;
RealD halfApBp2= (alpha+beta+2.0)*0.5;
T1 = halfAmB * in + halfApBp2*y;
for(int n=2;n<=order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
RealD cnp = 2.0*n*(n+alpha+beta)*(2.0*n-2.0+alpha+beta);
RealD cny = (2.0*n-2.0+alpha+beta)*(2.0*n-1.0+alpha+beta)*(2.0*n+alpha+beta);
RealD cn1 = (2.0*n+alpha+beta-1.0)*(alpha*alpha-beta*beta);
RealD cnm = - 2.0*(n+alpha-1.0)*(n+beta-1.0)*(2.0*n+alpha+beta);
// Tnp= ( cny * y *Tn + cn1 * Tn + cnm * Tnm )/ cnp;
cny=cny/cnp;
cn1=cn1/cnp;
cn1=cn1/cnp;
cnm=cnm/cnp;
*Tnp=cny*y + cn1 *(*Tn) + cnm * (*Tnm);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
out=*Tnp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -27,8 +27,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
double MultiShiftFunction::approx(double x) double MultiShiftFunction::approx(double x)
{ {
double a = norm; double a = norm;
@@ -54,4 +53,4 @@ void MultiShiftFunction::csv(std::ostream &out)
} }
return; return;
} }
NAMESPACE_END(Grid); }

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef MULTI_SHIFT_FUNCTION #ifndef MULTI_SHIFT_FUNCTION
#define MULTI_SHIFT_FUNCTION #define MULTI_SHIFT_FUNCTION
NAMESPACE_BEGIN(Grid); namespace Grid {
class MultiShiftFunction { class MultiShiftFunction {
public: public:
@@ -63,5 +63,5 @@ public:
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -298,7 +298,7 @@ void AlgRemez::stpini(bigfloat *step) {
// Search for error maxima and minima // Search for error maxima and minima
void AlgRemez::search(bigfloat *step) { void AlgRemez::search(bigfloat *step) {
bigfloat a, q, xm, ym, xn, yn, xx0, xx1; bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
int i, meq, emsign, ensign, steps; int i, j, meq, emsign, ensign, steps;
meq = neq + 1; meq = neq + 1;
bigfloat *yy = new bigfloat[meq]; bigfloat *yy = new bigfloat[meq];
@@ -306,6 +306,7 @@ void AlgRemez::search(bigfloat *step) {
bigfloat eclose = 1.0e30; bigfloat eclose = 1.0e30;
bigfloat farther = 0l; bigfloat farther = 0l;
j = 1;
xx0 = apstrt; xx0 = apstrt;
for (i = 0; i < meq; i++) { for (i = 0; i < meq; i++) {

View File

@@ -58,8 +58,8 @@
/* Compute the partial fraction expansion coefficients (alpha) from the /* Compute the partial fraction expansion coefficients (alpha) from the
* factored form */ * factored form */
NAMESPACE_BEGIN(Grid); namespace Grid {
NAMESPACE_BEGIN(Approx); namespace Approx {
static void construct_partfrac(izd *z) { static void construct_partfrac(izd *z) {
int dn = z -> dn, dd = z -> dd, type = z -> type; int dn = z -> dn, dd = z -> dd, type = z -> type;
@@ -516,9 +516,7 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
free(d); free(d);
return zd; return zd;
} }
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#ifdef TEST #ifdef TEST
@@ -587,7 +585,6 @@ static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
return (ONE - T) / (ONE + T); return (ONE - T) / (ONE + T);
} }
/* Test program. Apart from printing out the parameters for R(x) it produces /* Test program. Apart from printing out the parameters for R(x) it produces
* the following data files for plotting (unless NPLOT is defined): * the following data files for plotting (unless NPLOT is defined):
* *
@@ -726,5 +723,5 @@ int main(int argc, char** argv) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
#endif /* TEST */
#endif /* TEST */

View File

@@ -1,13 +1,13 @@
/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */ /* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
#ifdef __cplusplus #ifdef __cplusplus
#include <Grid/Namespace.h> namespace Grid {
NAMESPACE_BEGIN(Grid); namespace Approx {
NAMESPACE_BEGIN(Approx);
#endif #endif
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL #ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION #ifndef PRECISION
#define PRECISION double #define PRECISION double
@@ -83,6 +83,5 @@ void zolotarev_free(zolotarev_data *zdata);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
NAMESPACE_END(Approx); }}
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -10,12 +10,10 @@
#ifndef INCLUDED_BIGFLOAT_H #ifndef INCLUDED_BIGFLOAT_H
#define INCLUDED_BIGFLOAT_H #define INCLUDED_BIGFLOAT_H
#define __GMP_WITHIN_CONFIGURE
#include <gmp.h> #include <gmp.h>
#include <mpf2mpfr.h> #include <mpf2mpfr.h>
#include <mpfr.h> #include <mpfr.h>
#undef __GMP_WITHIN_CONFIGURE
class bigfloat { class bigfloat {
private: private:

View File

@@ -90,8 +90,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard(); psi.checkerboard = src.checkerboard;
grid = src.Grid(); grid = src._grid;
RealD f; RealD f;
RealD rtzp,rtz,a,d,b; RealD rtzp,rtz,a,d,b;

View File

@@ -27,11 +27,13 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
NAMESPACE_BEGIN(Grid);
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction // Block conjugate gradient. Dimension zero should be the block direction
@@ -40,6 +42,7 @@ template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> { class BlockConjugateGradient : public OperatorFunction<Field> {
public: public:
typedef typename Field::scalar_type scomplex; typedef typename Field::scalar_type scomplex;
int blockDim ; int blockDim ;
@@ -51,15 +54,21 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer PrintInterval; //GridLogMessages or Iterative
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true) BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100) : Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
{}; {};
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it) // Thin QR factorisation (google it)
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions //Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock // R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
@@ -76,20 +85,22 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
// Cdag C = Rdag R ; passes. // Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes // QdagQ = 1 ; passes
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
sliceInnerProductMatrix(m_rr,R,R,Orthog); sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related // Force manifest hermitian to avoid rounding related
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); #if 0
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
auto D_ldlt = m_rr.ldlt().vectorD();
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
#endif
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -101,25 +112,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
sliceMulMatrix(Q,Cinv,R,Orthog); sliceMulMatrix(Q,Cinv,R,Orthog);
} }
// see comments above
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
const std::vector<Field> & R)
{
InnerProductMatrix(m_rr,R,R);
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
MulMatrix(Q,Cinv,R);
}
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Call one of several implementations // Call one of several implementations
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -127,20 +119,14 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{ {
if ( CGtype == BlockCGrQ ) { if ( CGtype == BlockCGrQ ) {
BlockCGrQsolve(Linop,Src,Psi); BlockCGrQsolve(Linop,Src,Psi);
} else if (CGtype == BlockCG ) {
BlockCGsolve(Linop,Src,Psi);
} else if (CGtype == CGmultiRHS ) { } else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi); CGmultiRHSsolve(Linop,Src,Psi);
} else { } else {
assert(0); assert(0);
} }
} }
virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi)
{
if ( CGtype == BlockCGrQVec ) {
BlockCGrQsolveVec(Linop,Src,Psi);
} else {
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// BlockCGrQ implementation: // BlockCGrQ implementation:
@@ -152,12 +138,11 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{ {
int Orthog = blockDim; // First dimension is block dim; this is an assumption int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B.Grid()->_fdimensions[Orthog]; Nblock = B._grid->_fdimensions[Orthog];
/* FAKE */
Nblock=8;
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.Checkerboard() = B.Checkerboard(); X.checkerboard = B.checkerboard;
conformable(X, B); conformable(X, B);
Field tmp(B); Field tmp(B);
@@ -217,10 +202,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl; std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it) //1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
Linop.HermOp(X, AD); Linop.HermOp(X, AD);
tmp = B - AD; tmp = B - AD;
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
D=Q; D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl; std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@@ -242,12 +232,14 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
MatrixTimer.Start(); MatrixTimer.Start();
Linop.HermOp(D, Z); Linop.HermOp(D, Z);
MatrixTimer.Stop(); MatrixTimer.Stop();
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
//4. M = [D^dag Z]^{-1} //4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start(); sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog); sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop(); sliceInnerTimer.Stop();
m_M = m_DZ.inverse(); m_M = m_DZ.inverse();
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
//5. X = X + D MC //5. X = X + D MC
m_tmp = m_M * m_C; m_tmp = m_M * m_C;
@@ -265,7 +257,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
//7. D = Q + D S^dag //7. D = Q + D S^dag
m_tmp = m_S.adjoint(); m_tmp = m_S.adjoint();
sliceMaddTimer.Start(); sliceMaddTimer.Start();
sliceMaddMatrix(D,m_tmp,D,Q,Orthog); sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
sliceMaddTimer.Stop(); sliceMaddTimer.Stop();
@@ -326,17 +317,163 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
IterationsToComplete = k; IterationsToComplete = k;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_alpha = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_beta = Eigen::MatrixXcd::Zero(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
/************************************************************************
* Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
************************************************************************
* O'Leary : R = B - A X
* O'Leary : P = M R ; preconditioner M = 1
* O'Leary : alpha = PAP^{-1} RMR
* O'Leary : beta = RMR^{-1}_old RMR_new
* O'Leary : X=X+Palpha
* O'Leary : R_new=R_old-AP alpha
* O'Leary : P=MR_new+P beta
*/
R = Src - AP;
P = R;
sliceInnerProductMatrix(m_rr,R,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
m_pAp_inv = m_pAp.inverse();
m_alpha = m_pAp_inv * m_rr ;
// Psi, R update
sliceMaddTimer.Start();
sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddMatrix(R ,m_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
m_rr_inv = m_rr.inverse();
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_rr,R,R,Orthog);
sliceInnerTimer.Stop();
m_beta = m_rr_inv *m_rr;
// Search update
sliceMaddTimer.Start();
sliceMaddMatrix(AP,m_beta,P,R,Orthog);
sliceMaddTimer.Stop();
P= AP;
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
RealD rr;
for(int b=0;b<Nblock;b++){
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// multiRHS conjugate gradient. Dimension zero should be the block direction // multiRHS conjugate gradient. Dimension zero should be the block direction
// Use this for spread out across nodes // Use this for spread out across nodes
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{ {
int Orthog = blockDim; // First dimension is block dim int Orthog = blockDim; // First dimension is block dim
Nblock = Src.Grid()->_fdimensions[Orthog]; Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.Checkerboard() = Src.Checkerboard(); Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src); conformable(Psi, Src);
Field P(Src); Field P(Src);
@@ -463,232 +600,7 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
IterationsToComplete = k; IterationsToComplete = k;
} }
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation:
//--------------------------
// X is guess/Solution
// B is RHS
// Solve A X_i = B_i ; i refers to Nblock index
////////////////////////////////////////////////////////////////////////////
void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X)
{
Nblock = B.size();
assert(Nblock == X.size());
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
for(int b=0;b<Nblock;b++){
X[b].Checkerboard() = B[b].Checkerboard();
conformable(X[b], B[b]);
conformable(X[b], X[0]);
}
Field Fake(B[0]);
std::vector<Field> tmp(Nblock,Fake);
std::vector<Field> Q(Nblock,Fake);
std::vector<Field> D(Nblock,Fake);
std::vector<Field> Z(Nblock,Fake);
std::vector<Field> AD(Nblock,Fake);
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
************************************************************************
* Dimensions:
*
* X,B==(Nferm x Nblock)
* A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
*
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
* for k:
* Z = AD
* M = [D^dag Z]^{-1}
* X = X + D MC
* QS = Q - ZM
* D = Q + D S^dag
* C = S C
*/
///////////////////////////////////////
// Initial block: initial search dir is guess
///////////////////////////////////////
std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b];
}
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
for(int b=0;b<Nblock;b++) D[b]=Q[b];
std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
//3. Z = AD
MatrixTimer.Start();
for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);
MatrixTimer.Stop();
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
InnerProductMatrix(m_DZ,D,Z);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();
MaddMatrix(X,m_tmp, D,X);
sliceMaddTimer.Stop();
//6. QS = Q - ZM
sliceMaddTimer.Start();
MaddMatrix(tmp,m_M,Z,Q,-1.0);
sliceMaddTimer.Stop();
QRTimer.Start();
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
QRTimer.Stop();
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
MaddMatrix(D,m_tmp,D,Q);
sliceMaddTimer.Stop();
//8. C = S C
m_C = m_S*m_C;
/*********************
* convergence monitor
*********************
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
for(int b=0;b<Nblock;b++) {
rrsum+=real(m_rr(b,b));
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
}; };
NAMESPACE_END(Grid); }
#endif

View File

@@ -1,248 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the CAGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
CommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: src " << ssq << std::endl;
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
ComplexD scale = 1.0/gamma[0];
v[0] = scale * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(v, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
MatrixTimer.Start();
LinOp.Op(v[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + v[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -31,7 +31,7 @@ directory
#ifndef GRID_CONJUGATE_GRADIENT_H #ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H #define GRID_CONJUGATE_GRADIENT_H
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@@ -41,9 +41,6 @@ NAMESPACE_BEGIN(Grid);
template <class Field> template <class Field>
class ConjugateGradient : public OperatorFunction<Field> { class ConjugateGradient : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
@@ -57,12 +54,11 @@ public:
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
psi.checkerboard = src.checkerboard;
conformable(psi, src); conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq; RealD cp, c, a, d, b, ssq, qq, b_pred;
//RealD b_pred;
Field p(src); Field p(src);
Field mmp(src); Field mmp(src);
@@ -72,6 +68,7 @@ public:
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b); Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp; r = src - mmp;
@@ -92,8 +89,6 @@ public:
// Check if guess is really REALLY good :) // Check if guess is really REALLY good :)
if (cp <= rsq) { if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
IterationsToComplete = 0;
return; return;
} }
@@ -109,7 +104,7 @@ public:
SolverTimer.Start(); SolverTimer.Start();
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations*1000; k++) {
c = cp; c = cp;
MatrixTimer.Start(); MatrixTimer.Start();
@@ -130,18 +125,15 @@ public:
b = cp / c; b = cp / c;
LinearCombTimer.Start(); LinearCombTimer.Start();
auto psi_v = psi.View(); parallel_for(int ss=0;ss<src._grid->oSites();ss++){
auto p_v = p.View(); vstream(psi[ss], a * p[ss] + psi[ss]);
auto r_v = r.View(); vstream(p [ss], b * p[ss] + r[ss]);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ }
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; << " residual " << cp << " target " << rsq << std::endl;
// Stopping condition // Stopping condition
if (cp <= rsq) { if (cp <= rsq) {
@@ -149,22 +141,22 @@ public:
Linop.HermOpAndNorm(psi, mmp, d, qq); Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src; p = mmp - src;
RealD srcnorm = std::sqrt(norm2(src)); RealD srcnorm = sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p)); RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
<< "\tComputed residual " << std::sqrt(cp / ssq) std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
<< "\tTrue residual " << true_residual std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
<< "\tTarget " << Tolerance << std::endl; std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogIterative << "Time breakdown "<<std::endl; std::cout << GridLogPerformance << "Time breakdown "<<std::endl;
std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogPerformance << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl; std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl; std::cout << GridLogPerformance << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl; std::cout << GridLogPerformance << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -173,12 +165,13 @@ public:
return; return;
} }
} }
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl; std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0); if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -28,12 +28,10 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
NAMESPACE_BEGIN(Grid); namespace Grid {
//Mixed precision restarted defect correction CG //Mixed precision restarted defect correction CG
template<class FieldD,class FieldF, template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> { class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public: public:
RealD Tolerance; RealD Tolerance;
@@ -52,12 +50,7 @@ NAMESPACE_BEGIN(Grid);
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser; LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradient(RealD tol, MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d), Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL){ }; OuterLoopNormMult(100.), guesser(NULL){ };
@@ -72,18 +65,18 @@ NAMESPACE_BEGIN(Grid);
GridStopWatch TotalTimer; GridStopWatch TotalTimer;
TotalTimer.Start(); TotalTimer.Start();
int cb = src_d_in.Checkerboard(); int cb = src_d_in.checkerboard;
sol_d.Checkerboard() = cb; sol_d.checkerboard = cb;
RealD src_norm = norm2(src_d_in); RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance; RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid(); GridBase* DoublePrecGrid = src_d_in._grid;
FieldD tmp_d(DoublePrecGrid); FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb; tmp_d.checkerboard = cb;
FieldD tmp2_d(DoublePrecGrid); FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb; tmp2_d.checkerboard = cb;
FieldD src_d(DoublePrecGrid); FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation src_d = src_d_in; //source for next inner iteration, computed from residual during operation
@@ -91,10 +84,10 @@ NAMESPACE_BEGIN(Grid);
RealD inner_tol = InnerTolerance; RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid); FieldF src_f(SinglePrecGrid);
src_f.Checkerboard() = cb; src_f.checkerboard = cb;
FieldF sol_f(SinglePrecGrid); FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb; sol_f.checkerboard = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false; CG_f.ErrorOnNoConverge = false;
@@ -122,7 +115,7 @@ NAMESPACE_BEGIN(Grid);
precisionChange(src_f, src_d); precisionChange(src_f, src_d);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
sol_f = Zero(); zeroit(sol_f);
//Optionally improve inner solver guess (eg using known eigenvectors) //Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) if(guesser != NULL)
@@ -156,6 +149,6 @@ NAMESPACE_BEGIN(Grid);
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H #ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H #define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@@ -41,9 +41,6 @@ class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
public OperatorFunction<Field> public OperatorFunction<Field>
{ {
public: public:
using OperatorFunction<Field>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
@@ -59,7 +56,7 @@ public:
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{ {
GridBase *grid = src.Grid(); GridBase *grid = src._grid;
int nshift = shifts.order; int nshift = shifts.order;
std::vector<Field> results(nshift,grid); std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi); (*this)(Linop,src,results,psi);
@@ -81,7 +78,7 @@ public:
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi) void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{ {
GridBase *grid = src.Grid(); GridBase *grid = src._grid;
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction" // Convenience references to the info stored in "MultiShiftFunction"
@@ -321,5 +318,5 @@ public:
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -28,11 +28,9 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H #ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H #define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class FieldD,class FieldF, template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> { class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public: public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
@@ -76,7 +74,7 @@ public:
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f; LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false; bool using_fallback = false;
psi.Checkerboard() = src.Checkerboard(); psi.checkerboard = src.checkerboard;
conformable(psi, src); conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred; RealD cp, c, a, d, b, ssq, qq, b_pred;
@@ -110,17 +108,17 @@ public:
// Check if guess is really REALLY good :) // Check if guess is really REALLY good :)
if (cp <= rsq) { if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n"; std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl; std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
return; return;
} }
//Single prec initialization //Single prec initialization
FieldF r_f(SinglePrecGrid); FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard(); r_f.checkerboard = r.checkerboard;
precisionChange(r_f, r); precisionChange(r_f, r);
FieldF psi_f(r_f); FieldF psi_f(r_f);
psi_f = Zero(); psi_f = zero;
FieldF p_f(r_f); FieldF p_f(r_f);
FieldF mmp_f(r_f); FieldF mmp_f(r_f);
@@ -180,12 +178,12 @@ public:
Linop_d.HermOpAndNorm(psi, mmp, d, qq); Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src; p = mmp - src;
RealD srcnorm = std::sqrt(norm2(src)); RealD srcnorm = sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p)); RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl; std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl; std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
@@ -219,7 +217,7 @@ public:
Linop_d.HermOpAndNorm(psi, mmp, d, qq); Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp; r = src - mmp;
psi_f = Zero(); psi_f = zero;
precisionChange(r_f, r); precisionChange(r_f, r);
cp = norm2(r); cp = norm2(r);
MaxResidSinceLastRelUp = cp; MaxResidSinceLastRelUp = cp;
@@ -251,7 +249,7 @@ public:
}; };
NAMESPACE_END(Grid); };

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CONJUGATE_RESIDUAL_H #ifndef GRID_CONJUGATE_RESIDUAL_H
#define GRID_CONJUGATE_RESIDUAL_H #define GRID_CONJUGATE_RESIDUAL_H
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@@ -39,8 +39,6 @@ NAMESPACE_BEGIN(Grid);
template<class Field> template<class Field>
class ConjugateResidual : public OperatorFunction<Field> { class ConjugateResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
int verbose; int verbose;
@@ -51,14 +49,14 @@ public:
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b; // c, d; RealD a, b, c, d;
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp; RealD rAr, rAAr, rArp;
RealD pAp, pAAp; RealD pAp, pAAp;
GridBase *grid = src.Grid(); GridBase *grid = src._grid;
psi=Zero(); psi=zero;
Field r(grid), p(grid), Ap(grid), Ar(grid); Field r(grid), p(grid), Ap(grid), Ar(grid);
r=src; r=src;
@@ -97,8 +95,8 @@ public:
axpy(r,-1.0,src,Ap); axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq; RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<std::sqrt(cp/ssq) << " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<std::sqrt(true_resid) << " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl; << " target " <<Tolerance <<std::endl;
return; return;
} }
@@ -109,5 +107,5 @@ public:
assert(0); assert(0);
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -33,13 +33,9 @@ namespace Grid {
template<class Field> template<class Field>
class ZeroGuesser: public LinearFunction<Field> { class ZeroGuesser: public LinearFunction<Field> {
public: public:
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); }; virtual void operator()(const Field &src, Field &guess) { guess = zero; };
};
template<class Field>
class DoNothingGuesser: public LinearFunction<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { };
}; };
template<class Field> template<class Field>
class SourceGuesser: public LinearFunction<Field> { class SourceGuesser: public LinearFunction<Field> {
public: public:
@@ -60,14 +56,14 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}; DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) { virtual void operator()(const Field &src,Field &guess) {
guess = Zero(); guess = zero;
assert(evec.size()==eval.size()); assert(evec.size()==eval.size());
auto N = evec.size(); auto N = evec.size();
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
const Field& tmp = evec[i]; const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
} }
guess.Checkerboard() = src.Checkerboard(); guess.checkerboard = src.checkerboard;
} }
}; };
@@ -90,15 +86,15 @@ public:
void operator()(const FineField &src,FineField &guess) { void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size(); int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0].Grid()); CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0].Grid()); guess_coarse = Zero(); CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
blockProject(src_coarse,src,subspace); blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i]; const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse); axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
} }
blockPromote(guess_coarse,guess,subspace); blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard(); guess.checkerboard = src.checkerboard;
}; };
}; };

View File

@@ -1,258 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the FCAGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner;
FlexibleCommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit,
LinearFunction<Field> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
PrecTimer.Start();
Preconditioner(v[iter], z[iter]);
PrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -1,256 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the FGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner;
FlexibleGeneralisedMinimalResidual(RealD tol,
Integer maxit,
LinearFunction<Field> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
PrecTimer.Start();
Preconditioner(v[iter], z[iter]);
PrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -1,244 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/GeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the GMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
GeneralisedMinimalResidual(RealD tol,
Integer maxit,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: src " << ssq << std::endl;
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "GeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(v, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
MatrixTimer.Start();
LinOp.Op(v[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + v[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -35,7 +35,7 @@ Author: Christoph Lehner <clehner@bnl.gov>
//#include <zlib.h> //#include <zlib.h>
#include <sys/stat.h> #include <sys/stat.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h // Move following 100 LOC to lattice/Lattice_basis.h
@@ -43,11 +43,6 @@ NAMESPACE_BEGIN(Grid);
template<class Field> template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k) void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{ {
// If assume basis[j] are already orthonormal,
// can take all inner products in parallel saving 2x bandwidth
// Save 3x bandwidth on the second line of loop.
// perhaps 2.5x speed up.
// 2x overall in Multigrid Lanczos
for(int j=0; j<k; ++j){ for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w); auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j]; w = w - ip*basis[j];
@@ -57,118 +52,44 @@ void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
template<class Field> template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].View();
Vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0]._grid;
for(int k=0;k<basis.size();k++){ parallel_region
basis_v[k] = basis[k].View();
}
#if 0
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
thread_region
{ {
vobj* B = Bt.data() + Nm * thread_num();
thread_for_in_region(ss, grid->oSites(),{ std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.; for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){ for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss]; B[j] +=Qt(j,k) * basis[k]._odata[ss];
} }
} }
for(int j=j0; j<j1; ++j){ for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j]; basis[j]._odata[ss] = B[j];
}
});
}
#else
int nrot = j1-j0;
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
// printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of Eigen matrix
Vector<double> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0];
for(int k=0;k<Nm;++k){
for(int j=0;j<Nm;++j){
Qt_p[j*Nm+k]=Qt(j,k);
} }
} }
// Block the loop to keep storage footprint down
vobj zz=Zero();
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
auto z=coalescedRead(zz);
coalescedWrite(Bp[ss],z);
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
} }
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
} }
// Extract a single rotated vector // Extract a single rotated vector
template<class Field> template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0].View()) View;
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0]._grid;
result.Checkerboard() = basis[0].Checkerboard(); result.checkerboard = basis[0].checkerboard;
auto result_v=result.View(); parallel_for(int ss=0;ss < grid->oSites();ss++){
Vector<View> basis_v(basis.size(),result_v); vobj B = zero;
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]); B +=Qt(j,k) * basis[k]._odata[ss];
}
result._odata[ss] = B;
} }
coalescedWrite(result_v[ss], B);
});
} }
template<class Field> template<class Field>
@@ -198,7 +119,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i); assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]); std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i]; idx[j] = idx[i];
@@ -229,19 +150,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
basisReorderInPlace(_v,sort_vals,idx); basisReorderInPlace(_v,sort_vals,idx);
} }
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Implicitly restarted lanczos // Implicitly restarted lanczos
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
@@ -351,7 +259,7 @@ public:
RealD _eresid, // resid in lmdue deficit RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=0, int _orth_period = 1, int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester), SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@@ -367,7 +275,7 @@ public:
RealD _eresid, // resid in lmdue deficit RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=0, int _orth_period = 1, int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) : IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester), SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm), Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
@@ -381,7 +289,7 @@ public:
template<typename T> static RealD normalise(T& v) template<typename T> static RealD normalise(T& v)
{ {
RealD nn = norm2(v); RealD nn = norm2(v);
nn = std::sqrt(nn); nn = sqrt(nn);
v = v * (1.0/nn); v = v * (1.0/nn);
return nn; return nn;
} }
@@ -413,10 +321,10 @@ until convergence
*/ */
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false) void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{ {
GridBase *grid = src.Grid(); GridBase *grid = src._grid;
assert(grid == evec[0].Grid()); assert(grid == evec[0]._grid);
// GridLogIRL.TimingMode(1); GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl; std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -441,17 +349,14 @@ until convergence
{ {
auto src_n = src; auto src_n = src;
auto tmp = src; auto tmp = src;
std::cout << GridLogIRL << " IRL source norm " << norm2(src) << std::endl;
const int _MAX_ITER_IRL_MEVAPP_ = 50; const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) { for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
normalise(src_n); normalise(src_n);
_HermOp(src_n,tmp); _HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.0001) if (fabs(evalMaxApprox/na - 1.0) < 0.05)
i=_MAX_ITER_IRL_MEVAPP_; i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na; evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
@@ -541,7 +446,7 @@ until convergence
assert(k2<Nm); assert(k2<Nm); assert(k1>0); assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl; std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Compressed vector f and beta(k2) // Compressed vector f and beta(k2)
@@ -549,7 +454,7 @@ until convergence
f *= Qt(k2-1,Nm-1); f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2]; f += lme[k2-1] * evec[k2];
beta_k = norm2(f); beta_k = norm2(f);
beta_k = std::sqrt(beta_k); beta_k = sqrt(beta_k);
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl; std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k; RealD betar = 1.0/beta_k;
@@ -572,7 +477,7 @@ until convergence
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl; std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
Field B(grid); B.Checkerboard() = evec[0].Checkerboard(); Field B(grid); B.checkerboard = evec[0].checkerboard;
// power of two search pattern; not every evalue in eval2 is assessed. // power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1; int allconv =1;
@@ -610,7 +515,7 @@ until convergence
converged: converged:
{ {
Field B(grid); B.Checkerboard() = evec[0].Checkerboard(); Field B(grid); B.checkerboard = evec[0].checkerboard;
basisRotate(evec,Qt,0,Nk,0,Nk,Nm); basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl; std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0; Nconv=0;
@@ -649,11 +554,11 @@ until convergence
/* Saad PP. 195 /* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0 1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do: 2. For k = 1,2,...,m Do:
3. wk:=Avk - b_k v_{k-1} 3. wk:=Avkβkv_{k1}
4. ak:=(wk,vk) // 4. αk:=(wk,vk) //
5. wk:=wk-akvk // wk orthog vk 5. wk:=wkαkvk // wk orthog vk
6. bk+1 := ||wk||_2. If b_k+1 = 0 then Stop 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
7. vk+1 := wk/b_k+1 7. vk+1 := wk/βk+1
8. EndDo 8. EndDo
*/ */
void step(std::vector<RealD>& lmd, void step(std::vector<RealD>& lmd,
@@ -661,7 +566,6 @@ until convergence
std::vector<Field>& evec, std::vector<Field>& evec,
Field& w,int Nm,int k) Field& w,int Nm,int k)
{ {
std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20; const RealD tiny = 1.0e-20;
assert( k< Nm ); assert( k< Nm );
@@ -673,20 +577,20 @@ until convergence
if(k>0) w -= lme[k-1] * evec[k-1]; if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec_k,w); ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
RealD alph = real(zalph); RealD alph = real(zalph);
w = w - alph * evec_k; w = w - alph * evec_k;// 5. wk:=wkαkvk
RealD beta = normalise(w); RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
lmd[k] = alph; lmd[k] = alph;
lme[k] = beta; lme[k] = beta;
if ( (k>0) && ( (k % orth_period) == 0 )) { if (k>0 && k % orth_period == 0) {
std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
orthogonalize(w,evec,k); // orthonormalise orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl; std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
} }
if(k < Nm-1) evec[k+1] = w; if(k < Nm-1) evec[k+1] = w;
@@ -694,8 +598,6 @@ until convergence
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl; std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny ) if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl; std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
} }
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
@@ -905,7 +807,7 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
// determination of 2x2 leading submatrix // determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2]; RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]); RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub))); RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift) // (Dsh: shift)
@@ -936,6 +838,5 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
abort(); abort();
} }
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -29,7 +29,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LOCAL_COHERENCE_IRL_H #ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H #define GRID_LOCAL_COHERENCE_IRL_H
NAMESPACE_BEGIN(Grid); namespace Grid {
struct LanczosParams : Serializable { struct LanczosParams : Serializable {
public: public:
@@ -58,7 +59,7 @@ public:
RealD , coarse_relax_tol, RealD , coarse_relax_tol,
std::vector<int>, blockSize, std::vector<int>, blockSize,
std::string, config, std::string, config,
std::vector < ComplexD >, omega, std::vector < std::complex<double> >, omega,
RealD, mass, RealD, mass,
RealD, M5); RealD, M5);
}; };
@@ -82,11 +83,11 @@ public:
}; };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0].Grid(); GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].Checkerboard(); int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.Checkerboard()= checkerboard; FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fout(FineGrid); fout.Checkerboard() = checkerboard; FineField fout(FineGrid); fout.checkerboard = checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl; _Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
@@ -117,11 +118,11 @@ public:
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0].Grid(); GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].Checkerboard(); int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.Checkerboard() =checkerboard; FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.Checkerboard() =checkerboard; FineField fout(FineGrid);fout.checkerboard =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl; _poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
@@ -181,10 +182,10 @@ public:
} }
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
GridBase *FineGrid = _subspace[0].Grid(); GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].Checkerboard(); int checkerboard = _subspace[0].checkerboard;
FineField fB(FineGrid);fB.Checkerboard() =checkerboard; FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.Checkerboard() =checkerboard; FineField fv(FineGrid);fv.checkerboard =checkerboard;
blockPromote(B,fv,_subspace); blockPromote(B,fv,_subspace);
@@ -304,11 +305,11 @@ public:
int Nk = nbasis; int Nk = nbasis;
subspace.resize(Nk,_FineGrid); subspace.resize(Nk,_FineGrid);
subspace[0]=1.0; subspace[0]=1.0;
subspace[0].Checkerboard()=_checkerboard; subspace[0].checkerboard=_checkerboard;
normalise(subspace[0]); normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){ for(int k=1;k<Nk;k++){
subspace[k].Checkerboard()=_checkerboard; subspace[k].checkerboard=_checkerboard;
Op(subspace[k-1],subspace[k]); Op(subspace[k-1],subspace[k]);
normalise(subspace[k]); normalise(subspace[k]);
} }
@@ -359,11 +360,7 @@ public:
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
typedef typename FineField::scalar_type Scalar;
// src=1.0;
src=Scalar(1.0);
src.Checkerboard() = _checkerboard;
int Nconv; int Nconv;
IRL.calc(evals_fine,subspace,src,Nconv,false); IRL.calc(evals_fine,subspace,src,Nconv,false);
@@ -405,5 +402,5 @@ public:
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -1,157 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/MinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MINIMAL_RESIDUAL_H
#define GRID_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
RealD overRelaxParam;
Integer IterationsToComplete; // Number of iterations the MR took to finish.
// Filled in upon completion
MinimalResidual(RealD tol, Integer maxit, Real ovrelparam = 1.0, bool err_on_no_conv = true)
: Tolerance(tol), MaxIterations(maxit), overRelaxParam(ovrelparam), ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
ComplexD a, c;
RealD d;
Field Mr(src);
Field r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Linop.Op(psi, Mr);
r = src - Mr;
RealD cp = norm2(r);
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "MinimalResidual: src " << ssq << std::endl;
std::cout << GridLogIterative << "MinimalResidual: cp,r " << cp << std::endl;
if (cp <= rsq) {
return;
}
std::cout << GridLogIterative << "MinimalResidual: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
MatrixTimer.Start();
Linop.Op(r, Mr);
MatrixTimer.Stop();
LinalgTimer.Start();
c = innerProduct(Mr, r);
d = norm2(Mr);
a = c / d;
a = a * overRelaxParam;
psi = psi + r * a;
r = r - Mr * a;
cp = norm2(r);
LinalgTimer.Stop();
std::cout << GridLogIterative << "MinimalResidual: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = " << a << " c = " << c << " d = " << d << std::endl;
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
Linop.Op(psi, Mr);
r = src - Mr;
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "MinimalResidual Converged on iteration " << k
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
if (ErrorOnNoConverge)
assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "MinimalResidual did NOT converge"
<< std::endl;
if (ErrorOnNoConverge)
assert(0);
IterationsToComplete = k;
}
};
} // namespace Grid
#endif

View File

@@ -1,276 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
public:
using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the MPFGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
GridStopWatch ChangePrecTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
GridBase* SinglePrecGrid;
LinearFunction<FieldF> &Preconditioner;
MixedPrecisionFlexibleGeneralisedMinimalResidual(RealD tol,
Integer maxit,
GridBase * sp_grid,
LinearFunction<FieldF> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, SinglePrecGrid(sp_grid)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
FieldD r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
std::cout << GridLogIterative << "MPFGMRES: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
ChangePrecTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "MPFGMRES: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: PrecChange " << ChangePrecTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
RealD cp = 0;
FieldD w(src.Grid());
FieldD r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<FieldD> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<FieldD> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<FieldD> &LinOp, std::vector<FieldD> &v, std::vector<FieldD> &z, FieldD &w, int iter) {
FieldF v_f(SinglePrecGrid);
FieldF z_f(SinglePrecGrid);
ChangePrecTimer.Start();
precisionChange(v_f, v[iter]);
precisionChange(z_f, z[iter]);
ChangePrecTimer.Stop();
PrecTimer.Start();
Preconditioner(v_f, z_f);
PrecTimer.Stop();
ChangePrecTimer.Start();
precisionChange(z[iter], z_f);
ChangePrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<FieldD> const &z, FieldD &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -28,85 +28,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_NORMAL_EQUATIONS_H #ifndef GRID_NORMAL_EQUATIONS_H
#define GRID_NORMAL_EQUATIONS_H #define GRID_NORMAL_EQUATIONS_H
NAMESPACE_BEGIN(Grid); namespace Grid {
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver // Take a matrix and form an NE solver calling a Herm solver
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations { template<class Field> class NormalEquations : public OperatorFunction<Field>{
private: private:
SparseMatrixBase<Field> & _Matrix; SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver; OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public: public:
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations trick // Wrap the usual normal equations trick
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver, NormalEquations(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver)
LinearFunction<Field> &Guess) : _Matrix(Matrix), _HermitianSolver(HermitianSolver) {};
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){ void operator() (const Field &in, Field &out){
Field src(in.Grid()); Field src(in._grid);
Field tmp(in.Grid());
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Matrix.Mdag(in,src); _Matrix.Mdag(in,src);
_Guess(src,out); _HermitianSolver(src,out); // Mdag M out = Mdag in
_HermitianSolver(MdagMOp,src,out); // Mdag M out = Mdag in
} }
}; };
template<class Field> class HPDSolver {
private:
LinearOperatorBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
HPDSolver(LinearOperatorBase<Field> &Matrix,
OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
_Guess(in,out);
_HermitianSolver(_Matrix,in,out); // Mdag M out = Mdag in
} }
};
template<class Field> class MdagMSolver {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
MdagMSolver(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
MdagMLinearOperator<SparseMatrixBase<Field>,Field> MdagMOp(_Matrix);
_Guess(in,out);
_HermitianSolver(MdagMOp,in,out); // Mdag M out = Mdag in
}
};
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -1,45 +0,0 @@
#pragma once
namespace Grid {
template<class Field> class PowerMethod
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_EST_ = 50;
for (int i=0;i<_MAX_ITER_EST_;i++) {
normalise(src_n);
HermOp.HermOp(src_n,tmp);
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox;
}
evalMaxApprox = na;
src_n = tmp;
}
assert(0);
return 0;
}
};
}

View File

@@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H #ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
#define GRID_PREC_CONJUGATE_RESIDUAL_H #define GRID_PREC_CONJUGATE_RESIDUAL_H
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@@ -56,7 +56,7 @@ public:
RealD rAr, rAAr, rArp; RealD rAr, rAAr, rArp;
RealD pAp, pAAp; RealD pAp, pAAp;
GridBase *grid = src.Grid(); GridBase *grid = src._grid;
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid); Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=zero; psi=zero;
@@ -115,5 +115,5 @@ public:
assert(0); assert(0);
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -36,50 +36,41 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
//NB. Likely not original reference since they are focussing on a preconditioner variant. //NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper // but VPGCR was nicely written up in their paper
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid); namespace Grid {
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
template<class Field> template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> { class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
public: public:
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
int verbose; int verbose;
int mmax; int mmax;
int nstep; int nstep;
int steps; int steps;
int level;
GridStopWatch PrecTimer; GridStopWatch PrecTimer;
GridStopWatch MatTimer; GridStopWatch MatTimer;
GridStopWatch LinalgTimer; GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner; LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
void Level(int lv) { level=lv; }; PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol), Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec), Preconditioner(Prec),
mmax(_mmax), mmax(_mmax),
nstep(_nstep) nstep(_nstep)
{ {
level=1;
verbose=1; verbose=1;
}; };
void operator() (const Field &src, Field &psi){ void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
psi=Zero(); psi=zero;
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
ssq=norm2(src); ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq; rsq=Tolerance*Tolerance*ssq;
Field r(src.Grid()); Field r(src._grid);
PrecTimer.Reset(); PrecTimer.Reset();
MatTimer.Reset(); MatTimer.Reset();
@@ -91,9 +82,9 @@ public:
steps=0; steps=0;
for(int k=0;k<MaxIterations;k++){ for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(src,psi,rsq); cp=GCRnStep(Linop,src,psi,rsq);
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl; std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
if(cp<rsq) { if(cp<rsq) {
@@ -102,33 +93,31 @@ public:
Linop.HermOp(psi,r); Linop.HermOp(psi,r);
axpy(r,-1.0,src,r); axpy(r,-1.0,src,r);
RealD tr = norm2(r); RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq) << " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq) << " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl; << " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
/* std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
*/
return; return;
} }
} }
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0); assert(0);
} }
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD cp; RealD cp;
RealD a, b; RealD a, b, c, d;
RealD zAz, zAAz; RealD zAz, zAAz;
RealD rq; RealD rAq, rq;
GridBase *grid = src.Grid(); GridBase *grid = src._grid;
Field r(grid); Field r(grid);
Field z(grid); Field z(grid);
@@ -143,8 +132,6 @@ public:
std::vector<Field> p(mmax,grid); std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax); std::vector<RealD> qq(mmax);
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
////////////////////////////////// //////////////////////////////////
// initial guess x0 is taken as nonzero. // initial guess x0 is taken as nonzero.
// r0=src-A x0 = src // r0=src-A x0 = src
@@ -152,26 +139,32 @@ public:
MatTimer.Start(); MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz); Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop(); MatTimer.Stop();
LinalgTimer.Start();
r=src-Az; r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
///////////////////// /////////////////////
// p = Prec(r) // p = Prec(r)
///////////////////// /////////////////////
PrecTimer.Start(); PrecTimer.Start();
Preconditioner(r,z); Preconditioner(r,z);
PrecTimer.Stop(); PrecTimer.Stop();
MatTimer.Start(); MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz); Linop.HermOp(z,tmp);
MatTimer.Stop(); MatTimer.Stop();
LinalgTimer.Start(); ttmp=tmp;
tmp=tmp-r;
/*
std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl;
std::cout<<GridLogMessage<<ttmp<<std::endl;
std::cout<<GridLogMessage<<tmp<<std::endl;
*/
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
//p[0],q[0],qq[0] //p[0],q[0],qq[0]
p[0]= z; p[0]= z;
@@ -179,7 +172,6 @@ public:
qq[0]= zAAz; qq[0]= zAAz;
cp =norm2(r); cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){ for(int k=0;k<nstep;k++){
@@ -189,21 +181,18 @@ public:
int peri_k = k %mmax; int peri_k = k %mmax;
int peri_kp= kp%mmax; int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real? rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k]; a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi); axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r); cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){ if((k==nstep-1)||(cp<rsq)){
return cp; return cp;
} }
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start(); PrecTimer.Start();
Preconditioner(r,z);// solve Az = r Preconditioner(r,z);// solve Az = r
@@ -211,9 +200,10 @@ public:
MatTimer.Start(); MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz); Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop(); MatTimer.Stop();
tmp=tmp-r;
LinalgTimer.Start(); std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[peri_kp]=Az; q[peri_kp]=Az;
p[peri_kp]=z; p[peri_kp]=z;
@@ -229,11 +219,12 @@ public:
} }
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
} }
assert(0); // never reached assert(0); // never reached
return cp; return cp;
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -1,371 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithmsf/iterative/QuasiMinimalResidual.h
Copyright (C) 2019
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Field>
RealD innerG5ProductReal(Field &l, Field &r)
{
Gamma G5(Gamma::Algebra::Gamma5);
Field tmp(l.Grid());
// tmp = G5*r;
G5R5(tmp,r);
ComplexD ip =innerProduct(l,tmp);
std::cout << "innerProductRealG5R5 "<<ip<<std::endl;
return ip.real();
}
template<class Field>
class QuasiMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge;
RealD Tolerance;
Integer MaxIterations;
Integer IterationCount;
QuasiMinimalResidual(RealD tol,
Integer maxit,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, ErrorOnNoConverge(err_on_no_conv)
{};
#if 1
void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)
{
RealD resid;
IterationCount=0;
RealD rho, rho_1, xi, gamma, gamma_1, theta, theta_1;
RealD eta, delta, ep, beta;
GridBase *Grid = b.Grid();
Field r(Grid), d(Grid), s(Grid);
Field v(Grid), w(Grid), y(Grid), z(Grid);
Field v_tld(Grid), w_tld(Grid), y_tld(Grid), z_tld(Grid);
Field p(Grid), q(Grid), p_tld(Grid);
Real normb = norm2(b);
LinOp.Op(x,r); r = b - r;
assert(normb> 0.0);
resid = norm2(r)/normb;
if (resid <= Tolerance) {
return;
}
v_tld = r;
y = v_tld;
rho = norm2(y);
// Take Gamma5 conjugate
// Gamma G5(Gamma::Algebra::Gamma5);
// G5R5(w_tld,r);
// w_tld = G5* v_tld;
w_tld=v_tld;
z = w_tld;
xi = norm2(z);
gamma = 1.0;
eta = -1.0;
theta = 0.0;
for (int i = 1; i <= MaxIterations; i++) {
// Breakdown tests
assert( rho != 0.0);
assert( xi != 0.0);
v = (1. / rho) * v_tld;
y = (1. / rho) * y;
w = (1. / xi) * w_tld;
z = (1. / xi) * z;
ComplexD Zdelta = innerProduct(z, y); // Complex?
std::cout << "Zdelta "<<Zdelta<<std::endl;
delta = Zdelta.real();
y_tld = y;
z_tld = z;
if (i > 1) {
p = y_tld - (xi * delta / ep) * p;
q = z_tld - (rho * delta / ep) * q;
} else {
p = y_tld;
q = z_tld;
}
LinOp.Op(p,p_tld); // p_tld = A * p;
ComplexD Zep = innerProduct(q, p_tld);
ep=Zep.real();
std::cout << "Zep "<<Zep <<std::endl;
// Complex Audit
assert(abs(ep)>0);
beta = ep / delta;
assert(abs(beta)>0);
v_tld = p_tld - beta * v;
y = v_tld;
rho_1 = rho;
rho = norm2(y);
LinOp.AdjOp(q,w_tld);
w_tld = w_tld - beta * w;
z = w_tld;
xi = norm2(z);
gamma_1 = gamma;
theta_1 = theta;
theta = rho / (gamma_1 * beta);
gamma = 1.0 / sqrt(1.0 + theta * theta);
std::cout << "theta "<<theta<<std::endl;
std::cout << "gamma "<<gamma<<std::endl;
assert(abs(gamma)> 0.0);
eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
if (i > 1) {
d = eta * p + (theta_1 * theta_1 * gamma * gamma) * d;
s = eta * p_tld + (theta_1 * theta_1 * gamma * gamma) * s;
} else {
d = eta * p;
s = eta * p_tld;
}
x =x+d; // update approximation vector
r =r-s; // compute residual
if ((resid = norm2(r) / normb) <= Tolerance) {
return;
}
std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
}
assert(0);
return; // no convergence
}
#else
// QMRg5 SMP thesis
void operator()(LinearOperatorBase<Field> &LinOp, const Field &b, Field &x)
{
// Real scalars
GridBase *grid = b.Grid();
Field r(grid);
Field p_m(grid), p_m_minus_1(grid), p_m_minus_2(grid);
Field v_m(grid), v_m_minus_1(grid), v_m_plus_1(grid);
Field tmp(grid);
RealD w;
RealD z1, z2;
RealD delta_m, delta_m_minus_1;
RealD c_m_plus_1, c_m, c_m_minus_1;
RealD s_m_plus_1, s_m, s_m_minus_1;
RealD alpha, beta, gamma, epsilon;
RealD mu, nu, rho, theta, xi, chi;
RealD mod2r, mod2b;
RealD tau2, target2;
mod2b=norm2(b);
/////////////////////////
// Initial residual
/////////////////////////
LinOp.Op(x,tmp);
r = b - tmp;
/////////////////////////
// \mu = \rho = |r_0|
/////////////////////////
mod2r = norm2(r);
rho = sqrt( mod2r);
mu=rho;
std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
/////////////////////////
// Zero negative history
/////////////////////////
v_m_plus_1 = Zero();
v_m_minus_1 = Zero();
p_m_minus_1 = Zero();
p_m_minus_2 = Zero();
// v0
v_m = (1.0/rho)*r;
/////////////////////////
// Initial coeffs
/////////////////////////
delta_m_minus_1 = 1.0;
c_m_minus_1 = 1.0;
c_m = 1.0;
s_m_minus_1 = 0.0;
s_m = 0.0;
/////////////////////////
// Set up convergence check
/////////////////////////
tau2 = mod2r;
target2 = mod2b * Tolerance*Tolerance;
for(int iter = 0 ; iter < MaxIterations; iter++){
/////////////////////////
// \delta_m = (v_m, \gamma_5 v_m)
/////////////////////////
delta_m = innerG5ProductReal(v_m,v_m);
std::cout << "QuasiMinimalResidual delta_m "<< delta_m<<std::endl;
/////////////////////////
// tmp = A v_m
/////////////////////////
LinOp.Op(v_m,tmp);
/////////////////////////
// \alpha = (v_m, \gamma_5 temp) / \delta_m
/////////////////////////
alpha = innerG5ProductReal(v_m,tmp);
alpha = alpha/delta_m ;
std::cout << "QuasiMinimalResidual alpha "<< alpha<<std::endl;
/////////////////////////
// \beta = \rho \delta_m / \delta_{m-1}
/////////////////////////
beta = rho * delta_m / delta_m_minus_1;
std::cout << "QuasiMinimalResidual beta "<< beta<<std::endl;
/////////////////////////
// \tilde{v}_{m+1} = temp - \alpha v_m - \beta v_{m-1}
/////////////////////////
v_m_plus_1 = tmp - alpha*v_m - beta*v_m_minus_1;
///////////////////////////////
// \rho = || \tilde{v}_{m+1} ||
///////////////////////////////
rho = sqrt( norm2(v_m_plus_1) );
std::cout << "QuasiMinimalResidual rho "<< rho<<std::endl;
///////////////////////////////
// v_{m+1} = \tilde{v}_{m+1}
///////////////////////////////
v_m_plus_1 = (1.0 / rho) * v_m_plus_1;
////////////////////////////////
// QMR recurrence coefficients.
////////////////////////////////
theta = s_m_minus_1 * beta;
gamma = c_m_minus_1 * beta;
epsilon = c_m * gamma + s_m * alpha;
xi = -s_m * gamma + c_m * alpha;
nu = sqrt( xi*xi + rho*rho );
c_m_plus_1 = fabs(xi) / nu;
if ( xi == 0.0 ) {
s_m_plus_1 = 1.0;
} else {
s_m_plus_1 = c_m_plus_1 * rho / xi;
}
chi = c_m_plus_1 * xi + s_m_plus_1 * rho;
std::cout << "QuasiMinimalResidual coeffs "<< theta <<" "<<gamma<<" "<< epsilon<<" "<< xi<<" "<< nu<<std::endl;
std::cout << "QuasiMinimalResidual coeffs "<< chi <<std::endl;
////////////////////////////////
//p_m=(v_m - \epsilon p_{m-1} - \theta p_{m-2}) / \chi
////////////////////////////////
p_m = (1.0/chi) * v_m - (epsilon/chi) * p_m_minus_1 - (theta/chi) * p_m_minus_2;
////////////////////////////////////////////////////////////////
// \psi = \psi + c_{m+1} \mu p_m
////////////////////////////////////////////////////////////////
x = x + ( c_m_plus_1 * mu ) * p_m;
////////////////////////////////////////
//
////////////////////////////////////////
mu = -s_m_plus_1 * mu;
delta_m_minus_1 = delta_m;
c_m_minus_1 = c_m;
c_m = c_m_plus_1;
s_m_minus_1 = s_m;
s_m = s_m_plus_1;
////////////////////////////////////
// Could use pointer swizzle games.
////////////////////////////////////
v_m_minus_1 = v_m;
v_m = v_m_plus_1;
p_m_minus_2 = p_m_minus_1;
p_m_minus_1 = p_m;
/////////////////////////////////////
// Convergence checks
/////////////////////////////////////
z1 = RealD(iter+1.0);
z2 = z1 + 1.0;
tau2 = tau2 *( z2 / z1 ) * s_m * s_m;
std::cout << " QuasiMinimumResidual iteration "<< iter<<std::endl;
std::cout << " QuasiMinimumResidual tau bound "<< tau2<<std::endl;
// Compute true residual
mod2r = tau2;
if ( 1 || (tau2 < (100.0 * target2)) ) {
LinOp.Op(x,tmp);
r = b - tmp;
mod2r = norm2(r);
std::cout << " QuasiMinimumResidual true residual is "<< mod2r<<std::endl;
}
if ( mod2r < target2 ) {
std::cout << " QuasiMinimumResidual has converged"<<std::endl;
return;
}
}
}
#endif
};
NAMESPACE_END(Grid);

View File

@@ -86,26 +86,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*/ */
namespace Grid { namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Use base class to share code
///////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver // Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackBase { // Now make the norm reflect extra factor of Mee
protected: template<class Field> class SchurRedBlackStaggeredSolve {
typedef CheckerBoardedSparseMatrixBase<Field> Matrix; private:
OperatorFunction<Field> & _HermitianRBSolver; OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise; int CBfactorise;
bool subGuess; bool subGuess;
bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
public: public:
SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false, /////////////////////////////////////////////////////
const bool _solnAsInitGuess = false) : // Wrap the usual normal equations Schur trick
_HermitianRBSolver(HermitianRBSolver), /////////////////////////////////////////////////////
useSolnAsInitGuess(_solnAsInitGuess) SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{ {
CBfactorise=0; CBfactorise=0;
subtractGuess(initSubGuess); subtractGuess(initSubGuess);
@@ -119,90 +116,12 @@ namespace Grid {
return subGuess; return subGuess;
} }
///////////////////////////////////////////////////////////// template<class Matrix>
// Shared code
/////////////////////////////////////////////////////////////
void operator() (Matrix & _Matrix,const Field &in, Field &out){ void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess; ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess); (*this)(_Matrix,in,out,guess);
} }
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out) template<class Matrix, class Guesser>
{
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
int nblock = in.size();
std::vector<Field> src_o(nblock,grid);
std::vector<Field> sol_o(nblock,grid);
std::vector<Field> guess_save;
Field resid(fgrid);
Field tmp(grid);
////////////////////////////////////////////////
// Prepare RedBlack source
////////////////////////////////////////////////
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
////////////////////////////////////////////////
// Make the guesses
////////////////////////////////////////////////
if ( subGuess ) guess_save.resize(nblock,grid);
for(int b=0;b<nblock;b++){
if(useSolnAsInitGuess) {
pickCheckerboard(Odd, sol_o[b], out[b]);
} else {
guess(src_o[b],sol_o[b]);
}
if ( subGuess ) {
guess_save[b] = sol_o[b];
}
}
//////////////////////////////////////////////////////////////
// Call the block solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
RedBlackSolve(_Matrix,src_o,sol_o);
////////////////////////////////////////////////
// A2A boolean behavioural control & reconstruct other checkerboard
////////////////////////////////////////////////
for(int b=0;b<nblock;b++) {
if (subGuess) sol_o[b] = sol_o[b] - guess_save[b];
///////// Needs even source //////////////
pickCheckerboard(Even,tmp,in[b]);
RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
/////////////////////////////////////////////////
// Check unprec residual if possible
/////////////////////////////////////////////////
if ( ! subGuess ) {
_Matrix.M(out[b],resid);
resid = resid-in[b];
RealD ns = norm2(in[b]);
RealD nr = norm2(resid);
std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
} else {
std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
}
}
}
template<class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){ void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function // FIXME CGdiagonalMee not implemented virtual function
@@ -210,42 +129,154 @@ namespace Grid {
GridBase *grid = _Matrix.RedBlackGrid(); GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid(); GridBase *fgrid= _Matrix.Grid();
Field resid(fgrid); SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_o(grid);
Field src_e(grid); Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid); Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
//////////////////////////////////////////////// std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
// RedBlack source pickCheckerboard(Even,src_e,in);
//////////////////////////////////////////////// pickCheckerboard(Odd ,src_o,in);
RedBlackSource(_Matrix,in,src_e,src_o); pickCheckerboard(Even,sol_e,out);
////////////////////////////////
// Construct the guess
////////////////////////////////
if(useSolnAsInitGuess) {
pickCheckerboard(Odd ,sol_o,out); pickCheckerboard(Odd ,sol_o,out);
} else { std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
guess(src_o,sol_o);
}
Field guess_save(grid); /////////////////////////////////////////////////////
guess_save = sol_o; // src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
//src_o = tmp; assert(src_o.checkerboard ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// Call the red-black solver // Call the red-black solver
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
RedBlackSolve(_Matrix,src_o,sol_o); std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
guess(src_o, sol_o);
//////////////////////////////////////////////// Mtmp = sol_o;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
// Fionn A2A boolean behavioural control // Fionn A2A boolean behavioural control
//////////////////////////////////////////////// if (subGuess) sol_o = sol_o-Mtmp;
if (subGuess) sol_o= sol_o-guess_save;
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// RedBlack solution needs the even source // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
RedBlackSolution(_Matrix,sol_o,src_e,out); _Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagMooeeSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false) : _HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=cb;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
guess(src_o,sol_o);
Mtmp = sol_o;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) sol_o = sol_o-Mtmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual // Verify the unprec residual
if ( ! subGuess ) { if ( ! subGuess ) {
@@ -254,233 +285,219 @@ namespace Grid {
RealD ns = norm2(in); RealD ns = norm2(in);
RealD nr = norm2(resid); RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl; std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else { } else {
std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl; std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
} }
} }
/////////////////////////////////////////////////////////////
// Override in derived.
/////////////////////////////////////////////////////////////
virtual void RedBlackSource (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) =0;
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) =0;
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) =0;
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)=0;
}; };
template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public: public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess)
{
}
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e(grid);
src_e = src_e_c; // Const correctness
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal has Mooee on it.
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e_i(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
//=> psi = MeeInv phi
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick // Wrap the usual normal equations Schur trick
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false, SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
const bool _solnAsInitGuess = false) _HermitianRBSolver(HermitianRBSolver)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{ {
CBfactorise = 0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix,class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid(); GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid(); GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid); Field tmp(grid);
Field Mtmp(grid); Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,src); pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,src); pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag // get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) //////////////////////////////////////////////////////////////
{ // Call the red-black solver
GridBase *grid = _Matrix.RedBlackGrid(); //////////////////////////////////////////////////////////////
GridBase *fgrid= _Matrix.Grid(); std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
Field sol_o_i(grid); guess(src_o,tmp);
Field tmp(grid); Mtmp = tmp;
Field sol_e(grid); _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
// Fionn A2A boolean behavioural control
//////////////////////////////////////////////// if (subGuess) tmp = tmp-Mtmp;
// MooeeInv due to pecond _Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
////////////////////////////////////////////////
_Matrix.MooeeInv(sol_o,tmp);
sol_o_i = tmp;
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even); src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even); setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd ); setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) // Verify the unprec residual
{ if ( ! subGuess ) {
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix); _Matrix.M(out,resid);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); resid = resid-in;
}; RealD ns = norm2(in);
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) RealD nr = norm2(resid);
{
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix); std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); } else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
} }
}; };
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoMixed {
private:
LinearFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
guess(src_o,tmp);
Mtmp = tmp;
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) tmp = tmp-Mtmp;
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout << GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
} }
#endif #endif

View File

@@ -1,25 +1,18 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <fcntl.h> #include <fcntl.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
MemoryStats *MemoryProfiler::stats = nullptr; MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false; bool MemoryProfiler::debug = false;
#ifdef GRID_NVCC
#define SMALL_LIMIT (0)
#else
#define SMALL_LIMIT (4096)
#endif
#ifdef POINTER_CACHE
int PointerCache::victim; int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) { void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < SMALL_LIMIT ) return ptr; if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP #ifdef GRID_OMP
assert(omp_in_parallel()==0); assert(omp_in_parallel()==0);
@@ -56,9 +49,9 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
void *PointerCache::Lookup(size_t bytes) { void *PointerCache::Lookup(size_t bytes) {
if (bytes < SMALL_LIMIT ) return NULL; if (bytes < 4096 ) return NULL;
#ifdef GRID_OMP #ifdef _OPENMP
assert(omp_in_parallel()==0); assert(omp_in_parallel()==0);
#endif #endif
@@ -70,7 +63,7 @@ void *PointerCache::Lookup(size_t bytes) {
} }
return NULL; return NULL;
} }
#endif
void check_huge_pages(void *Buf,uint64_t BYTES) void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
@@ -129,5 +122,4 @@ std::string sizeString(const size_t bytes)
return std::string(buf); return std::string(buf);
} }
NAMESPACE_END(Grid); }

View File

@@ -40,22 +40,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <mm_malloc.h> #include <mm_malloc.h>
#endif #endif
#define POINTER_CACHE namespace Grid {
#define GRID_ALLOC_ALIGN (2*1024*1024)
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#ifdef POINTER_CACHE
class PointerCache { class PointerCache {
private: private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
#ifdef GRID_NVCC
static const int Ncache=128;
#else
static const int Ncache=8; static const int Ncache=8;
#endif
static int victim; static int victim;
typedef struct { typedef struct {
@@ -68,11 +58,11 @@ private:
public: public:
static void *Insert(void *ptr,size_t bytes) ; static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ; static void *Lookup(size_t bytes) ;
}; };
#endif
std::string sizeString(size_t bytes); std::string sizeString(size_t bytes);
@@ -162,46 +152,29 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else // if ( ptr != NULL )
pointer ptr = nullptr; // std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
#endif
#ifdef GRID_NVCC //////////////////
//////////////////////////////////// // Hack 2MB align; could make option probably doesn't need configurability
// Unified (managed) memory //////////////////
//////////////////////////////////// //define GRID_ALLOC_ALIGN (128)
if ( ptr == (_Tp *) NULL ) { #define GRID_ALLOC_ALIGN (2*1024*1024)
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else #else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif #endif
assert( ptr != (_Tp *)NULL); // std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
//////////////////////////////////////////////////
// First touch optimise in threaded loop // First touch optimise in threaded loop
////////////////////////////////////////////////// uint8_t *cp = (uint8_t *)ptr;
uint64_t *cp = (uint64_t *)ptr; #ifdef GRID_OMP
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page #pragma omp parallel for
cp[n]=0;
});
#endif #endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr; return ptr;
} }
@@ -210,40 +183,133 @@ public:
profilerFree(bytes); profilerFree(bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme); if ( __freeme ) _mm_free((void *)__freeme);
#else #else
if ( __freeme ) free((void *)__freeme); if ( __freeme ) free((void *)__freeme);
#endif #endif
#endif
} }
void construct(pointer __p, const _Tp& __val) { };
// FIXME: hack for the copy constructor, eventually it must be avoided
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif
template<typename _Tp>
class commAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
// BACKTRACEFILE();
exit(0);
}
assert( bcast == (void *) ptr);
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
}
#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,alignedAllocator<T> >; template<class T> using commVector = std::vector<T,commAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
NAMESPACE_END(Grid); }; // namespace Grid
#endif #endif

View File

@@ -30,15 +30,16 @@
#ifndef GRID_CARTESIAN_BASE_H #ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H #define GRID_CARTESIAN_BASE_H
NAMESPACE_BEGIN(Grid);
namespace Grid{
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid // Commicator provides information on the processor grid
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// unsigned long _ndimension; // unsigned long _ndimension;
// Coordinate _processors; // processor grid // std::vector<int> _processors; // processor grid
// int _processor; // linear processor rank // int _processor; // linear processor rank
// Coordinate _processor_coor; // linear processor rank // std::vector<int> _processor_coor; // linear processor rank
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread { class GridBase : public CartesianCommunicator , public GridThread {
@@ -47,40 +48,38 @@ public:
// Give Lattice access // Give Lattice access
template<class object> friend class Lattice; template<class object> friend class Lattice;
GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) { LocallyPeriodic=0;}; GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const std::vector<int> & processor_grid,
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent, const CartesianCommunicator &parent,
int &split_rank) int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {LocallyPeriodic=0;}; : CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const std::vector<int> & processor_grid,
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent) const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {LocallyPeriodic=0;}; : CartesianCommunicator(processor_grid,parent,dummy) {};
virtual ~GridBase() = default; virtual ~GridBase() = default;
// Physics Grid information. // Physics Grid information.
Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes. std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal
Coordinate _gdimensions;// Global dimensions of array after cb removal std::vector<int> _gdimensions;// Global dimensions of array after cb removal
Coordinate _ldimensions;// local dimensions of array with processor images removed std::vector<int> _ldimensions;// local dimensions of array with processor images removed
Coordinate _rdimensions;// Reduced local dimensions with simd lane images and processor images removed std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
Coordinate _ostride; // Outer stride for each dimension std::vector<int> _ostride; // Outer stride for each dimension
Coordinate _istride; // Inner stride i.e. within simd lane std::vector<int> _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions). int _osites; // _isites*_osites = product(dimensions).
int _isites; int _isites;
int _fsites; // _isites*_osites = product(dimensions). int _fsites; // _isites*_osites = product(dimensions).
int _gsites; int _gsites;
Coordinate _slice_block;// subslice information std::vector<int> _slice_block;// subslice information
Coordinate _slice_stride; std::vector<int> _slice_stride;
Coordinate _slice_nblock; std::vector<int> _slice_nblock;
Coordinate _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic;
public: public:
@@ -89,7 +88,7 @@ public:
// GridCartesian / GridRedBlackCartesian // GridCartesian / GridRedBlackCartesian
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0; virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const Coordinate &site)=0; virtual int CheckerBoard(const std::vector<int> &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
@@ -108,20 +107,20 @@ public:
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously. // lanes are operated upon simultaneously.
virtual int oIndex(Coordinate &coor) virtual int oIndex(std::vector<int> &coor)
{ {
int idx=0; int idx=0;
// Works with either global or local coordinates // Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx; return idx;
} }
virtual int iIndex(Coordinate &lcoor) virtual int iIndex(std::vector<int> &lcoor)
{ {
int idx=0; int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx; return idx;
} }
inline int oIndexReduced(Coordinate &ocoor) inline int oIndexReduced(std::vector<int> &ocoor)
{ {
int idx=0; int idx=0;
// ocoor is already reduced so can eliminate the modulo operation // ocoor is already reduced so can eliminate the modulo operation
@@ -129,11 +128,11 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d]; for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx; return idx;
} }
inline void oCoorFromOindex (Coordinate& coor,int Oindex){ inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
} }
inline void InOutCoorToLocalCoor (Coordinate &ocoor, Coordinate &icoor, Coordinate &lcoor) { inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) {
lcoor.resize(_ndimension); lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
@@ -142,7 +141,7 @@ public:
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// SIMD lane addressing // SIMD lane addressing
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
inline void iCoorFromIindex(Coordinate &coor,int lane) inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{ {
Lexicographic::CoorFromIndex(coor,lane,_simd_layout); Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
} }
@@ -153,6 +152,8 @@ public:
inline int PermuteType(int dimension){ inline int PermuteType(int dimension){
int permute_type=0; int permute_type=0;
// //
// FIXME:
//
// Best way to encode this would be to present a mask // Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation // for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just // size. If there is only one simd dimension rotated, this is just
@@ -185,11 +186,11 @@ public:
inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;}; inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; }; inline const std::vector<int> LocalStarts(void) { return _lstart; };
inline const Coordinate &FullDimensions(void) { return _fdimensions;}; inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
inline const Coordinate &GlobalDimensions(void) { return _gdimensions;}; inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
inline const Coordinate &LocalDimensions(void) { return _ldimensions;}; inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;};
inline const Coordinate &VirtualLocalDimensions(void) { return _ldimensions;}; inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details // Utility to print the full decomposition details
@@ -213,15 +214,15 @@ public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Global addressing // Global addressing
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){ void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
assert(gidx< gSites()); assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
} }
void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){ void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
assert(lidx<lSites()); assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
} }
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){ void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
gidx=0; gidx=0;
int mult=1; int mult=1;
for(int mu=0;mu<_ndimension;mu++) { for(int mu=0;mu<_ndimension;mu++) {
@@ -229,7 +230,7 @@ public:
mult*=_gdimensions[mu]; mult*=_gdimensions[mu];
} }
} }
void GlobalCoorToProcessorCoorLocalCoor(Coordinate &pcoor,Coordinate &lcoor,const Coordinate &gcoor) void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
{ {
pcoor.resize(_ndimension); pcoor.resize(_ndimension);
lcoor.resize(_ndimension); lcoor.resize(_ndimension);
@@ -239,14 +240,14 @@ public:
lcoor[mu] = gcoor[mu]%_fld; lcoor[mu] = gcoor[mu]%_fld;
} }
} }
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const Coordinate &gcoor) void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
{ {
Coordinate pcoor; std::vector<int> pcoor;
Coordinate lcoor; std::vector<int> lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor); rank = RankFromProcessorCoor(pcoor);
/* /*
Coordinate cblcoor(lcoor); std::vector<int> cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){ for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) { if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2; cblcoor[d] = lcoor[d]/2;
@@ -257,10 +258,10 @@ public:
o_idx= oIndex(lcoor); o_idx= oIndex(lcoor);
} }
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , Coordinate &gcoor) void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
{ {
gcoor.resize(_ndimension); gcoor.resize(_ndimension);
Coordinate coor(_ndimension); std::vector<int> coor(_ndimension);
ProcessorCoorFromRank(rank,coor); ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
@@ -272,19 +273,20 @@ public:
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
} }
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,Coordinate &fcoor) void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
{ {
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){ if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb; fcoor[0] = fcoor[0]*2+cb;
} }
} }
void ProcessorCoorLocalCoorToGlobalCoor(Coordinate &Pcoor,Coordinate &Lcoor,Coordinate &gcoor) void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
{ {
gcoor.resize(_ndimension); gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
} }
}; };
NAMESPACE_END(Grid);
}
#endif #endif

View File

@@ -28,12 +28,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CARTESIAN_FULL_H #ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H #define GRID_CARTESIAN_FULL_H
NAMESPACE_BEGIN(Grid); namespace Grid{
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// Grid Support. // Grid Support.
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
class GridCartesian: public GridBase { class GridCartesian: public GridBase {
public: public:
@@ -48,7 +49,7 @@ public:
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
return 0; return 0;
} }
virtual int CheckerBoard(const Coordinate &site){ virtual int CheckerBoard(const std::vector<int> &site){
return 0; return 0;
} }
virtual int CheckerBoardDestination(int cb,int shift,int dim){ virtual int CheckerBoardDestination(int cb,int shift,int dim){
@@ -63,16 +64,16 @@ public:
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator. // Constructor takes a parent grid and possibly subdivides communicator.
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
GridCartesian(const Coordinate &dimensions, GridCartesian(const std::vector<int> &dimensions,
const Coordinate &simd_layout, const std::vector<int> &simd_layout,
const Coordinate &processor_grid, const std::vector<int> &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
} }
GridCartesian(const Coordinate &dimensions, GridCartesian(const std::vector<int> &dimensions,
const Coordinate &simd_layout, const std::vector<int> &simd_layout,
const Coordinate &processor_grid, const std::vector<int> &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
@@ -80,18 +81,18 @@ public:
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
// Construct from comm world // Construct from comm world
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
GridCartesian(const Coordinate &dimensions, GridCartesian(const std::vector<int> &dimensions,
const Coordinate &simd_layout, const std::vector<int> &simd_layout,
const Coordinate &processor_grid) : GridBase(processor_grid) const std::vector<int> &processor_grid) : GridBase(processor_grid)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
} }
virtual ~GridCartesian() = default; virtual ~GridCartesian() = default;
void Init(const Coordinate &dimensions, void Init(const std::vector<int> &dimensions,
const Coordinate &simd_layout, const std::vector<int> &simd_layout,
const Coordinate &processor_grid) const std::vector<int> &processor_grid)
{ {
/////////////////////// ///////////////////////
// Grid information // Grid information
@@ -169,6 +170,5 @@ public:
}; };
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -29,7 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CARTESIAN_RED_BLACK_H #ifndef GRID_CARTESIAN_RED_BLACK_H
#define GRID_CARTESIAN_RED_BLACK_H #define GRID_CARTESIAN_RED_BLACK_H
NAMESPACE_BEGIN(Grid);
namespace Grid {
static const int CbRed =0; static const int CbRed =0;
static const int CbBlack=1; static const int CbBlack=1;
@@ -40,7 +41,7 @@ static const int Odd =CbBlack;
class GridRedBlackCartesian : public GridBase class GridRedBlackCartesian : public GridBase
{ {
public: public:
Coordinate _checker_dim_mask; std::vector<int> _checker_dim_mask;
int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;
@@ -48,7 +49,7 @@ public:
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;
} }
virtual int CheckerBoard(const Coordinate &site){ virtual int CheckerBoard(const std::vector<int> &site){
int linear=0; int linear=0;
assert(site.size()==_ndimension); assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
@@ -58,6 +59,7 @@ public:
return (linear&0x1); return (linear&0x1);
} }
// Depending on the cb of site, we toggle source cb. // Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e) // for block #b, element #e = (b, e)
// we need // we need
@@ -81,7 +83,7 @@ public:
} }
virtual int CheckerBoardFromOindex (int Oindex) virtual int CheckerBoardFromOindex (int Oindex)
{ {
Coordinate ocoor; std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex); oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor); return CheckerBoard(ocoor);
} }
@@ -116,7 +118,7 @@ public:
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base) GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{ {
int dims = base->_ndimension; int dims = base->_ndimension;
Coordinate checker_dim_mask(dims,1); std::vector<int> checker_dim_mask(dims,1);
int checker_dim = 0; int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim); Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
}; };
@@ -125,7 +127,7 @@ public:
// Create redblack from original grid, with non-trivial checker dim mask // Create redblack from original grid, with non-trivial checker dim mask
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base, GridRedBlackCartesian(const GridBase *base,
const Coordinate &checker_dim_mask, const std::vector<int> &checker_dim_mask,
int checker_dim int checker_dim
) : GridBase(base->_processors,*base) ) : GridBase(base->_processors,*base)
{ {
@@ -133,11 +135,40 @@ public:
} }
virtual ~GridRedBlackCartesian() = default; virtual ~GridRedBlackCartesian() = default;
#if 0
////////////////////////////////////////////////////////////
// Create redblack grid ;; deprecate these. Should not
// need direct creation of redblack without a full grid to base on
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(processor_grid,*base)
{
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
void Init(const Coordinate &dimensions, ////////////////////////////////////////////////////////////
const Coordinate &simd_layout, // Create redblack grid
const Coordinate &processor_grid, ////////////////////////////////////////////////////////////
const Coordinate &checker_dim_mask, GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid,*base)
{
std::vector<int> checker_dim_mask(dimensions.size(),1);
int checker_dim = 0;
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
#endif
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim) int checker_dim)
{ {
@@ -251,7 +282,7 @@ public:
}; };
protected: protected:
virtual int oIndex(Coordinate &coor) virtual int oIndex(std::vector<int> &coor)
{ {
int idx = 0; int idx = 0;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
@@ -268,7 +299,7 @@ protected:
return idx; return idx;
}; };
virtual int iIndex(Coordinate &lcoor) virtual int iIndex(std::vector<int> &lcoor)
{ {
int idx = 0; int idx = 0;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
@@ -285,5 +316,5 @@ protected:
return idx; return idx;
} }
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_COMMUNICATOR_H #ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H #define GRID_COMMUNICATOR_H
#include <Grid/util/Coordinate.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h> #include <Grid/communicator/Communicator_base.h>

View File

@@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <limits.h> #include <limits.h>
#include <sys/mman.h> #include <sys/mman.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
@@ -47,8 +47,8 @@ int CartesianCommunicator::Dimensions(void) { return
int CartesianCommunicator::IsBoss(void) { return _processor==0; }; int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; }; int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; }; int CartesianCommunicator::ThisRank(void) { return _processor; };
const Coordinate & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const Coordinate & CartesianCommunicator::ProcessorGrid(void) { return _processors; }; const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; }; int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@@ -72,6 +72,5 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);
} }
NAMESPACE_END(Grid); }

View File

@@ -34,7 +34,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
class CartesianCommunicator : public SharedMemory { class CartesianCommunicator : public SharedMemory {
@@ -52,9 +52,9 @@ public:
// Communicator should know nothing of the physics grid, only processor grid. // Communicator should know nothing of the physics grid, only processor grid.
//////////////////////////////////////////// ////////////////////////////////////////////
int _Nprocessors; // How many in all int _Nprocessors; // How many in all
Coordinate _processors; // Which dimensions get relayed out over processors lanes. std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank int _processor; // linear processor rank
Coordinate _processor_coor; // linear processor coordinate std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension; unsigned long _ndimension;
static Grid_MPI_Comm communicator_world; static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator; Grid_MPI_Comm communicator;
@@ -69,8 +69,8 @@ public:
// Constructors to sub-divide a parent communicator // Constructors to sub-divide a parent communicator
// and default to comm world // and default to comm world
//////////////////////////////////////////////// ////////////////////////////////////////////////
CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank); CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const Coordinate &pdimensions_in); CartesianCommunicator(const std::vector<int> &pdimensions_in);
virtual ~CartesianCommunicator(); virtual ~CartesianCommunicator();
private: private:
@@ -79,7 +79,7 @@ private:
// Private initialise from an MPI communicator // Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private // Can use after an MPI_Comm_split, but hidden from user so private
//////////////////////////////////////////////// ////////////////////////////////////////////////
void InitFromMPICommunicator(const Coordinate &processors, Grid_MPI_Comm communicator_base); void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
public: public:
@@ -88,15 +88,15 @@ public:
// Wraps MPI_Cart routines, or implements equivalent on other impls // Wraps MPI_Cart routines, or implements equivalent on other impls
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest); void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(Coordinate &coor); int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,Coordinate &coor); void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
int Dimensions(void) ; int Dimensions(void) ;
int IsBoss(void) ; int IsBoss(void) ;
int BossRank(void) ; int BossRank(void) ;
int ThisRank(void) ; int ThisRank(void) ;
const Coordinate & ThisProcessorCoor(void) ; const std::vector<int> & ThisProcessorCoor(void) ;
const Coordinate & ProcessorGrid(void) ; const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ; int ProcessorCount(void) ;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@@ -199,10 +199,9 @@ public:
template<class obj> void Broadcast(int root,obj &data) template<class obj> void Broadcast(int root,obj &data)
{ {
Broadcast(root,(void *)&data,sizeof(data)); Broadcast(root,(void *)&data,sizeof(data));
}
}; };
NAMESPACE_END(Grid); };
}
#endif #endif

View File

@@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
@@ -44,26 +44,21 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
assert(0); assert(0);
} }
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) { Grid_quiesce_nodes();
assert(0);
}
}
// Never clean up as done once. // Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
Grid_quiesce_nodes();
GlobalSharedMemory::Init(communicator_world); GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate( GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES, GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages); GlobalSharedMemory::Hugepages);
Grid_unquiesce_nodes();
} }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
@@ -74,14 +69,14 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0); assert(ierr==0);
} }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{ {
int rank; int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank); int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0); assert(ierr==0);
return rank; return rank;
} }
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor) void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{ {
coor.resize(_ndimension); coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]); int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
@@ -91,7 +86,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world // Initialises from communicator_world
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
MPI_Comm optimal_comm; MPI_Comm optimal_comm;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
@@ -110,12 +105,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
////////////////////////////////// //////////////////////////////////
// Try to subdivide communicator // Try to subdivide communicator
////////////////////////////////// //////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
{ {
_ndimension = processors.size(); assert(_ndimension>=1); _ndimension = processors.size();
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0); std::vector<int> parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1); std::vector<int> parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc... // Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension; int pad = _ndimension-parent_ndimension;
@@ -128,8 +124,10 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
// split the communicator // split the communicator
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// int Nparent = parent._processors ; // int Nparent = parent._processors ;
// std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
int Nparent; int Nparent;
MPI_Comm_size(parent.communicator,&Nparent); MPI_Comm_size(parent.communicator,&Nparent);
// std::cout << " Parent size "<<Nparent <<std::endl;
int childsize=1; int childsize=1;
for(int d=0;d<processors.size();d++) { for(int d=0;d<processors.size();d++) {
@@ -138,9 +136,11 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
int Nchild = Nparent/childsize; int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent); assert (childsize * Nchild == Nparent);
Coordinate ccoor(_ndimension); // coor within subcommunicator // std::cout << " child size "<<childsize <<std::endl;
Coordinate scoor(_ndimension); // coor of split within parent
Coordinate ssize(_ndimension); // coor of split within parent std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d]; ccoor[d] = parent_processor_coor[d] % processors[d];
@@ -157,6 +157,36 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
MPI_Comm comm_split; MPI_Comm comm_split;
if ( Nchild > 1 ) { if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Split the communicator // Split the communicator
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -195,7 +225,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
} }
} }
void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base) void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{ {
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo // Creates communicator, and the communicator_halo
@@ -212,7 +242,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
_Nprocessors*=_processors[i]; _Nprocessors*=_processors[i];
} }
Coordinate periodic(_ndimension,1); std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator); MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor); MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@@ -449,7 +479,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
Coordinate row(_ndimension,1); std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension); assert(dim>=0 && dim<_ndimension);
// Split the communicator // Split the communicator
@@ -478,6 +508,7 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
MPI_Type_free(&object); MPI_Type_free(&object);
} }
NAMESPACE_END(Grid);
}

View File

@@ -27,7 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
@@ -42,17 +42,17 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
GlobalSharedMemory::Hugepages); GlobalSharedMemory::Hugepages);
} }
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors) : CartesianCommunicator(processors)
{ {
srank=0; srank=0;
SetCommunicator(communicator_world); SetCommunicator(communicator_world);
} }
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
_processors = processors; _processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1); _ndimension = processors.size();
_processor_coor.resize(_ndimension); _processor_coor.resize(_ndimension);
// Require 1^N processor grid for fake // Require 1^N processor grid for fake
@@ -122,8 +122,8 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;} int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; } void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
source =0; source =0;
@@ -160,6 +160,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};
NAMESPACE_END(Grid);
}

View File

@@ -28,11 +28,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
// static data // static data
int GlobalSharedMemory::HPEhypercube = 1;
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL; uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0; int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup; int GlobalSharedMemory::_ShmSetup;
@@ -77,7 +76,6 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl; std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
assert(heap_bytes<heap_size); assert(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr; return ptr;
} }
void SharedMemory::ShmBufferFreeAll(void) { void SharedMemory::ShmBufferFreeAll(void) {
@@ -86,9 +84,9 @@ void SharedMemory::ShmBufferFreeAll(void) {
} }
void *SharedMemory::ShmBufferSelf(void) void *SharedMemory::ShmBufferSelf(void)
{ {
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank]; return ShmCommBufs[ShmRank];
} }
NAMESPACE_END(Grid);
}

View File

@@ -25,6 +25,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
// b) shared memory buffers container
// -- static globally shared; init once
// -- per instance set of buffers.
//
#pragma once #pragma once
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
@@ -41,8 +53,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <sys/shm.h> #include <sys/shm.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <zlib.h> #include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
NAMESPACE_BEGIN(Grid); namespace Grid {
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
@@ -56,18 +71,12 @@ class GlobalSharedMemory {
private: private:
static const int MAXLOG2RANKSPERNODE = 16; static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation // Init once lock on the buffer allocation
static int _ShmSetup; static int _ShmSetup;
static int _ShmAlloc; static int _ShmAlloc;
static uint64_t _ShmAllocBytes; static uint64_t _ShmAllocBytes;
public: public:
///////////////////////////////////////
// HPE 8600 hypercube optimisation
///////////////////////////////////////
static int HPEhypercube;
static int ShmSetup(void) { return _ShmSetup; } static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; } static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; } static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
@@ -93,17 +102,12 @@ public:
// Create an optimal reordered communicator that makes MPI_Cart_create get it right // Create an optimal reordered communicator that makes MPI_Cart_create get it right
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// Provide shared memory facilities off comm world // Provide shared memory facilities off comm world
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
}; };
@@ -144,7 +148,6 @@ public:
// Call on any instance // Call on any instance
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
void SharedMemoryTest(void); void SharedMemoryTest(void);
void *ShmBufferSelf(void); void *ShmBufferSelf(void);
void *ShmBuffer (int rank); void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p); void *ShmBufferTranslate(int rank,void * local_p);
@@ -159,5 +162,4 @@ public:
}; };
NAMESPACE_END(Grid); }

View File

@@ -29,12 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
#ifdef GRID_NVCC namespace Grid {
#include <cuda_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
@@ -50,11 +46,6 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize); MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank // WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes // WorldNodes
@@ -139,53 +130,9 @@ int Log2Size(int TwoToPower,int MAXLOG2)
} }
return log2size; return log2size;
} }
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
//////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions
//////////////////////////////////////////////////////////////////////////////
const int namelen = _POSIX_HOST_NAME_MAX;
char name[namelen];
int R;
int I;
int N;
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
for(int p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
break;
}
}
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
#ifdef HYPERCUBE
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Assert power of two shm_size. // Assert power of two shm_size.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -226,9 +173,9 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
} }
std::string hname(name); std::string hname(name);
// std::cout << "hostname "<<hname<<std::endl; std::cout << "hostname "<<hname<<std::endl;
// std::cout << "R " << R << " I " << I << " N "<< N std::cout << "R " << R << " I " << I << " N "<< N
// << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl; << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition. // broadcast node 0's base coordinate for this partition.
@@ -250,13 +197,16 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// in a maximally symmetrical way // in a maximally symmetrical way
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
Coordinate processor_coor(ndimension); std::vector<int> processor_coor(ndimension);
Coordinate WorldDims = processors; std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
Coordinate ShmDims (ndimension); Coordinate NodeDims (ndimension); std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
Coordinate ShmCoor (ndimension); Coordinate NodeCoor (ndimension); Coordinate WorldCoor(ndimension); std::vector<int> HyperCoor(ndimension);
Coordinate HyperCoor(ndimension); int dim = 0;
for(int l2=0;l2<log2size;l2++){
GetShmDims(WorldDims,ShmDims); while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
@@ -303,19 +253,28 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0); assert(ierr==0);
} #else
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) ////////////////////////////////////////////////////////////////
{ // Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims // Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way // in a maximally symmetrical way
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
Coordinate processor_coor(ndimension); std::vector<int> processor_coor(ndimension);
Coordinate WorldDims = processors; Coordinate ShmDims(ndimension); Coordinate NodeDims (ndimension); std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension); std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
GetShmDims(WorldDims,ShmDims);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -347,6 +306,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0); assert(ierr==0);
#endif
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET // SHMGET
@@ -354,7 +314,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
#ifdef GRID_MPI3_SHMGET #ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
@@ -377,7 +337,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
int errsv = errno; int errsv = errno;
printf("Errno %d\n",errsv); printf("Errno %d\n",errsv);
printf("key %d\n",key); printf("key %d\n",key);
printf("size %ld\n",size); printf("size %lld\n",size);
printf("flags %d\n",flags); printf("flags %d\n",flags);
perror("shmget"); perror("shmget");
exit(1); exit(1);
@@ -413,104 +373,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// TODO/FIXME : NOT ALL NVLINK BOARDS have full Peer to peer connectivity.
// The annoyance is that they have partial peer 2 peer. This occurs on the 8 GPU blades.
// e.g. DGX1, supermicro board,
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
#else
std::cout << "setting device to WorldShmRank"<<std::endl;
cudaSetDevice(WorldShmRank);
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
auto err = cudaMalloc(&ShmCommBuf, bytes);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
for(int r=0;r<WorldShmSize;r++){
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
r,
WorldShmComm);
assert(ierr==0);
}
///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if ( r!=WorldShmRank ) {
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
WorldShmCommBufs[r] = thisBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
#ifdef GRID_MPI3_SHMMMAP #ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -547,7 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
} }
_ShmAlloc=1; _ShmAlloc=1;
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
@@ -557,7 +423,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE #ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -589,7 +455,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
} }
_ShmAlloc=1; _ShmAlloc=1;
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
@@ -604,7 +470,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm); MPI_Barrier(WorldShmComm);
@@ -633,7 +499,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif #endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl; std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) { if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap"); perror("failed mmap");
assert(0); assert(0);
@@ -670,27 +536,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
} }
#endif #endif
#endif // End NVCC case for GPU device buffers
/////////////////////////////////////////////////////////////////////////
// Routines accessing shared memory should route through for GPU safety
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
#ifdef GRID_NVCC
cudaMemset(dest,0,bytes);
#else
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{
#ifdef GRID_NVCC
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else
bcopy(src,dest,bytes);
#endif
}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
@@ -722,6 +571,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@@ -734,26 +584,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r; std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_IBM_SUMMIT
// Hide the shared memory path between sockets
// if even number of nodes
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
for(int r=0;r<size;r++){
int hisRank=ShmRanks[r];
if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
}
}
#endif
SharedMemoryTest();
} }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// On node barrier // On node barrier
@@ -768,26 +598,24 @@ void SharedMemory::ShmBarrier(void)
void SharedMemory::SharedMemoryTest(void) void SharedMemory::SharedMemoryTest(void)
{ {
ShmBarrier(); ShmBarrier();
uint64_t check[3];
uint64_t magic = 0x5A5A5A;
if ( ShmRank == 0 ) { if ( ShmRank == 0 ) {
for(uint64_t r=0;r<ShmSize;r++){ for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GlobalSharedMemory::WorldNode; check[0] = GlobalSharedMemory::WorldNode;
check[1] = r; check[1] = r;
check[2]=magic; check[2] = 0x5A5A5A;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){ for(int r=0;r<ShmSize;r++){
ShmBarrier(); uint64_t * check = (uint64_t *) ShmCommBufs[r];
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode); assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r); assert(check[1]==r);
assert(check[2]==magic); assert(check[2]==0x5A5A5A);
ShmBarrier();
} }
ShmBarrier();
} }
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)
@@ -801,6 +629,7 @@ void *SharedMemory::ShmBuffer(int rank)
} }
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{ {
static int count =0;
int gpeer = ShmRanks[rank]; int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){ if (gpeer == MPI_UNDEFINED){
@@ -819,5 +648,4 @@ SharedMemory::~SharedMemory()
} }
}; };
NAMESPACE_END(Grid); }

View File

@@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -47,7 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
_ShmSetup=1; _ShmSetup=1;
} }
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{ {
optimal_comm = WorldComm; optimal_comm = WorldComm;
} }
@@ -125,5 +125,4 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
SharedMemory::~SharedMemory() SharedMemory::~SharedMemory()
{}; {};
NAMESPACE_END(Grid); }

View File

@@ -25,9 +25,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef _GRID_CSHIFT_COMMON_H_
#define _GRID_CSHIFT_COMMON_H_
NAMESPACE_BEGIN(Grid); namespace Grid {
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
@@ -35,21 +36,20 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask = 0x3; cbmask = 0x3;
} }
int so=plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int ent = 0; int ent = 0;
static Vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int stride=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View(); int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@@ -63,68 +63,66 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b); table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
} }
} }
} }
} }
thread_for(i,ent,{ parallel_for(int i=0;i<ent;i++){
buffer[table[i].first]=rhs_v[table[i].second]; buffer[table[i].first]=rhs._odata[table[i].second];
}); }
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split // Gather for when there *is* need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_extract(const Lattice<vobj> &rhs, Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
ExtractPointerArray<typename vobj::scalar_object> pointers,
int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask = 0x3; cbmask = 0x3;
} }
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs._grid->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){ if ( cbmask ==0x3){
thread_for_collapse(2,n,e1,{ parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*n1; int o = n*n1;
int offset = b+n*e2; int offset = b+n*e2;
vobj temp =rhs_v[so+o+b]; vobj temp =rhs._odata[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}
} }
});
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
std::cout << " Dense packed buffer WARNING " <<std::endl; std::cout << " Dense packed buffer WARNING " <<std::endl;
thread_for_collapse(2,n,e1,{ parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o=n*n1; int o=n*n1;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*e2; int offset = b+n*e2;
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b]; vobj temp =rhs._odata[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
} }
}); }
} }
} }
@@ -133,17 +131,17 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent =0; int ent =0;
@@ -152,8 +150,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs.Grid()->_slice_block[dimension]; int bo =n*rhs._grid->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b); table[ent++] = std::pair<int,int>(so+o+b,bo+b);
} }
} }
@@ -162,8 +160,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int bo=0; int bo=0;
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++); table[ent++]=std::pair<int,int> (so+o+b,bo++);
} }
@@ -171,51 +169,48 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
} }
} }
auto rhs_v = rhs.View(); parallel_for(int i=0;i<ent;i++){
thread_for(i,ent,{ rhs._odata[table[i].first]=buffer[table[i].second];
rhs_v[table[i].first]=buffer[table[i].second]; }
});
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split // Scatter for when there *is* need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<typename vobj::scalar_object> pointers,int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
auto rhs_v = rhs.View(); parallel_for_nest2(int n=0;n<e1;n++){
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension];
merge(rhs_v[so+o+b],pointers,offset); merge(rhs._odata[so+o+b],pointers,offset);
}
} }
});
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View();
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension]; int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
merge(rhs_v[so+o+b],pointers,offset); merge(rhs._odata[so+o+b],pointers,offset);
} }
} }
} }
@@ -227,18 +222,18 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask) template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0; int ent=0;
@@ -253,7 +248,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride+b; int o =n*stride+b;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o); int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o); table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
@@ -261,33 +256,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
} }
} }
auto rhs_v = rhs.View(); parallel_for(int i=0;i<ent;i++){
auto lhs_v = lhs.View(); lhs._odata[table[i].first]=rhs._odata[table[i].second];
thread_for(i,ent,{ }
lhs_v[table[i].first]=rhs_v[table[i].second];
});
} }
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs._grid->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block [dimension]; int e2=rhs._grid->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0; int ent=0;
double t_tab,t_perm;
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@@ -298,16 +292,14 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} }
auto rhs_v = rhs.View(); parallel_for(int i=0;i<ent;i++){
auto lhs_v = lhs.View(); permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
thread_for(i,ent,{ }
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
@@ -317,8 +309,10 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
double t_local;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3); Cshift_local(ret,rhs,dimension,shift,0x3);
@@ -330,7 +324,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
GridBase *grid = rhs.Grid(); GridBase *grid = rhs._grid;
int fd = grid->_fdimensions[dimension]; int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension]; int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension]; int ld = grid->_ldimensions[dimension];
@@ -341,18 +335,18 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
shift = (shift+fd)%fd; shift = (shift+fd)%fd;
// the permute type // the permute type
ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
int permute_dim =grid->PermuteDim(dimension); int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension); int permute_type=grid->PermuteType(dimension);
int permute_type_dist; int permute_type_dist;
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
// int o = 0; int o = 0;
int bo = x * grid->_ostride[dimension]; int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd; int sx = (x+sshift)%rd;
// wrap is whether sshift > rd. // wrap is whether sshift > rd.
@@ -393,5 +387,5 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
} }
} }
NAMESPACE_END(Grid); }
#endif

View File

@@ -30,27 +30,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define _GRID_CSHIFT_MPI_H_ #define _GRID_CSHIFT_MPI_H_
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs.Grid()); Lattice<vobj> ret(rhs._grid);
int fd = rhs.Grid()->_fdimensions[dimension]; int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension. // Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd; shift = (shift+fd)%fd;
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
// the permute type // the permute type
int simd_layout = rhs.Grid()->_simd_layout[dimension]; int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ; int comm_dim = rhs._grid->_processors[dimension] >1 ;
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
if ( !comm_dim ) { if ( !comm_dim ) {
@@ -70,10 +70,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; // std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl; // std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3); Cshift_comms(ret,rhs,dimension,shift,0x3);
@@ -88,8 +88,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
@@ -107,25 +107,25 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid(); GridBase *grid=rhs._grid;
Lattice<vobj> temp(rhs.Grid()); Lattice<vobj> temp(rhs._grid);
int fd = rhs.Grid()->_fdimensions[dimension]; int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension]; int pd = rhs._grid->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension]; int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ; int comm_dim = rhs._grid->_processors[dimension] >1 ;
assert(simd_layout==1); assert(simd_layout==1);
assert(comm_dim==1); assert(comm_dim==1);
assert(shift>=0); assert(shift>=0);
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size); commVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size); commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
@@ -145,7 +145,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
// int rank = grid->_processor; int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
@@ -165,7 +165,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
GridBase *grid=rhs.Grid(); GridBase *grid=rhs._grid;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
@@ -193,21 +193,21 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// Simd direction uses an extract/merge pair // Simd direction uses an extract/merge pair
/////////////////////////////////////////////// ///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); // std::vector<scalar_object *> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers std::vector<scalar_object *> rpointers(Nsimd); // received pointers
/////////////////////////////////////////// ///////////////////////////////////////////
// Work out what to send where // Work out what to send where
/////////////////////////////////////////// ///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even; int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
// loop over outer coord planes orthog to dim // loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
@@ -258,7 +258,5 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
} }
} }
}
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -27,14 +27,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#ifndef _GRID_CSHIFT_NONE_H_ #ifndef _GRID_CSHIFT_NONE_H_
#define _GRID_CSHIFT_NONE_H_ #define _GRID_CSHIFT_NONE_H_
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
Lattice<vobj> ret(rhs.Grid()); Lattice<vobj> ret(rhs._grid);
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
Cshift_local(ret,rhs,dimension,shift); Cshift_local(ret,rhs,dimension,shift);
return ret; return ret;
} }
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -1,4 +1,3 @@
#ifndef __NVCC__
/* /*
__ _____ _____ _____ __ _____ _____ _____
__| | __| | | | JSON for Modern C++ __| | __| | | | JSON for Modern C++
@@ -18919,4 +18918,3 @@ inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std
#endif #endif
#endif

View File

@@ -25,22 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_LATTICE_H
#include <Grid/lattice/Lattice_base.h> #define GRID_LATTICE_H
#include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h>
#include <Grid/lattice/Lattice_arith.h>
#include <Grid/lattice/Lattice_trace.h>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h>
//#include <Grid/lattice/Lattice_where.h>
#include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_base.h>
#endif

View File

@@ -36,13 +36,13 @@ directory
#include <typeinfo> #include <typeinfo>
#include <vector> #include <vector>
NAMESPACE_BEGIN(Grid); namespace Grid {
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Predicated where support // Predicated where support
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template <class iobj, class vobj, class robj> template <class iobj, class vobj, class robj>
accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
const robj &iffalse) { const robj &iffalse) {
typename std::remove_const<vobj>::type ret; typename std::remove_const<vobj>::type ret;
@@ -51,10 +51,11 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd(); const int Nsimd = vobj::vector_type::Nsimd();
const int words = sizeof(vobj) / sizeof(vector_type);
ExtractBuffer<Integer> mask(Nsimd); std::vector<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd); std::vector<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd); std::vector<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals); extract(iftrue, truevals);
extract(iffalse, falsevals); extract(iffalse, falsevals);
@@ -68,139 +69,149 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
return ret; return ret;
} }
///////////////////////////////////////////////////// ////////////////////////////////////////////
// recursive evaluation of expressions; Could
// switch to generic approach with variadics, a la
// Antonin's Lat Sim but the repack to variadic with popped
// from tuple is hideous; C++14 introduces std::make_index_sequence for this
////////////////////////////////////////////
// leaf eval of lattice ; should enable if protect using traits
template <typename T>
using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T>
using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
//Specialization of getVectorType for lattices //Specialization of getVectorType for lattices
/////////////////////////////////////////////////////
template<typename T> template<typename T>
struct getVectorType<Lattice<T> >{ struct getVectorType<Lattice<T> >{
typedef typename Lattice<T>::vector_object type; typedef typename Lattice<T>::vector_object type;
}; };
//////////////////////////////////////////// template<class sobj>
//-- recursive evaluation of expressions; -- inline sobj eval(const unsigned int ss, const sobj &arg)
// handle leaves of syntax tree
///////////////////////////////////////////////////
template<class sobj> accelerator_inline
sobj eval(const uint64_t ss, const sobj &arg)
{ {
return arg; return arg;
} }
template <class lobj>
template <class lobj> accelerator_inline inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg) return arg._odata[ss];
{
return arg[ss];
}
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
auto view = arg.View();
return view[ss];
} }
/////////////////////////////////////////////////// // handle nodes in syntax tree
// handle nodes in syntax tree- eval one operand template <typename Op, typename T1>
/////////////////////////////////////////////////// auto inline eval(
template <typename Op, typename T1> accelerator_inline const unsigned int ss,
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr) const LatticeUnaryExpression<Op, T1> &expr) // eval one operand
-> decltype(expr.op.func( eval(ss, expr.arg1))) -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
{ return expr.first.func(eval(ss, std::get<0>(expr.second)));
return expr.op.func( eval(ss, expr.arg1) );
} }
///////////////////////
// eval two operands template <typename Op, typename T1, typename T2>
/////////////////////// auto inline eval(
template <typename Op, typename T1, typename T2> accelerator_inline const unsigned int ss,
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr) const LatticeBinaryExpression<Op, T1, T2> &expr) // eval two operands
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2))) -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
{ eval(ss, std::get<1>(expr.second)))) {
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) ); return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)));
} }
///////////////////////
// eval three operands template <typename Op, typename T1, typename T2, typename T3>
/////////////////////// auto inline eval(const unsigned int ss,
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline const LatticeTrinaryExpression<Op, T1, T2, T3>
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) &expr) // eval three operands
-> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3))) -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
{ eval(ss, std::get<1>(expr.second)),
return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)); eval(ss, std::get<2>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)));
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Obtain the grid from an expression, ensuring conformable. This must follow a // Obtain the grid from an expression, ensuring conformable. This must follow a
// tree recursion; must retain grid pointer in the LatticeView class which sucks // tree recursion
// Use a different method, and make it void *.
// Perhaps a conformable method.
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,
accelerator_inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
{ {
lat.Conformable(grid); if (grid) {
conformable(grid, lat._grid);
} }
grid = lat._grid;
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> }
accelerator_inline template <class T1,
void GridFromExpression(GridBase *&grid,const T1 &notlat) // non-lattice leaf typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid,
const T1 &notlat) // non-lattice leaf
{} {}
template <typename Op, typename T1> template <typename Op, typename T1>
accelerator_inline inline void GridFromExpression(GridBase *&grid,
void GridFromExpression(GridBase *&grid,const LatticeUnaryExpression<Op, T1> &expr) const LatticeUnaryExpression<Op, T1> &expr) {
{ GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, expr.arg1); // recurse
} }
template <typename Op, typename T1, typename T2> template <typename Op, typename T1, typename T2>
accelerator_inline inline void GridFromExpression(
void GridFromExpression(GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
{ GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, expr.arg1); // recurse GridFromExpression(grid, std::get<1>(expr.second));
GridFromExpression(grid, expr.arg2);
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
accelerator_inline inline void GridFromExpression(
void GridFromExpression(GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
{ GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, expr.arg1); // recurse GridFromExpression(grid, std::get<1>(expr.second));
GridFromExpression(grid, expr.arg2); // recurse GridFromExpression(grid, std::get<2>(expr.second));
GridFromExpression(grid, expr.arg3); // recurse
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Obtain the CB from an expression, ensuring conformable. This must follow a // Obtain the CB from an expression, ensuring conformable. This must follow a
// tree recursion // tree recursion
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{ {
if ((cb == Odd) || (cb == Even)) { if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.Checkerboard()); assert(cb == lat.checkerboard);
} }
cb = lat.Checkerboard(); cb = lat.checkerboard;
// std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
} }
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{ {
// std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
}
template <typename Op, typename T1>
inline void CBFromExpression(int &cb,
const LatticeUnaryExpression<Op, T1> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
// std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
} }
template <typename Op, typename T1> inline template <typename Op, typename T1, typename T2>
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr) inline void CBFromExpression(int &cb,
{ const LatticeBinaryExpression<Op, T1, T2> &expr) {
CBFromExpression(cb, expr.arg1); // recurse AST CBFromExpression(cb, std::get<0>(expr.second)); // recurse
} CBFromExpression(cb, std::get<1>(expr.second));
// std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, expr.arg2); // recurse AST
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) inline void CBFromExpression(
{ int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
CBFromExpression(cb, expr.arg1); // recurse AST CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, expr.arg2); // recurse AST CBFromExpression(cb, std::get<1>(expr.second));
CBFromExpression(cb, expr.arg3); // recurse AST CBFromExpression(cb, std::get<2>(expr.second));
// std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
} }
//////////////////////////////////////////// ////////////////////////////////////////////
@@ -209,7 +220,7 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
#define GridUnopClass(name, ret) \ #define GridUnopClass(name, ret) \
template <class arg> \ template <class arg> \
struct name { \ struct name { \
static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \ static auto inline func(const arg a) -> decltype(ret) { return ret; } \
}; };
GridUnopClass(UnarySub, -a); GridUnopClass(UnarySub, -a);
@@ -242,18 +253,16 @@ GridUnopClass(UnaryExp, exp(a));
#define GridBinOpClass(name, combination) \ #define GridBinOpClass(name, combination) \
template <class left, class right> \ template <class left, class right> \
struct name { \ struct name { \
static auto accelerator_inline \ static auto inline func(const left &lhs, const right &rhs) \
func(const left &lhs, const right &rhs) \ -> decltype(combination) const { \
-> decltype(combination) const \
{ \
return combination; \ return combination; \
} \ } \
}; }
GridBinOpClass(BinaryAdd, lhs + rhs); GridBinOpClass(BinaryAdd, lhs + rhs);
GridBinOpClass(BinarySub, lhs - rhs); GridBinOpClass(BinarySub, lhs - rhs);
GridBinOpClass(BinaryMul, lhs *rhs); GridBinOpClass(BinaryMul, lhs *rhs);
GridBinOpClass(BinaryDiv, lhs /rhs); GridBinOpClass(BinaryDiv, lhs /rhs);
GridBinOpClass(BinaryAnd, lhs &rhs); GridBinOpClass(BinaryAnd, lhs &rhs);
GridBinOpClass(BinaryOr, lhs | rhs); GridBinOpClass(BinaryOr, lhs | rhs);
GridBinOpClass(BinaryAndAnd, lhs &&rhs); GridBinOpClass(BinaryAndAnd, lhs &&rhs);
@@ -265,18 +274,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
#define GridTrinOpClass(name, combination) \ #define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \ template <class predicate, class left, class right> \
struct name { \ struct name { \
static auto accelerator_inline \ static auto inline func(const predicate &pred, const left &lhs, \
func(const predicate &pred, const left &lhs, const right &rhs) \ const right &rhs) -> decltype(combination) const { \
-> decltype(combination) const \
{ \
return combination; \ return combination; \
} \ } \
}; }
GridTrinOpClass(TrinaryWhere, GridTrinOpClass(
(predicatedWhere<predicate, TrinaryWhere,
typename std::remove_reference<left>::type, (predicatedWhere<predicate, typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs,rhs))); typename std::remove_reference<right>::type>(pred, lhs,
rhs)));
//////////////////////////////////////////// ////////////////////////////////////////////
// Operator syntactical glue // Operator syntactical glue
@@ -284,32 +292,50 @@ GridTrinOpClass(TrinaryWhere,
#define GRID_UNOP(name) name<decltype(eval(0, arg))> #define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))> #define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))> #define GRID_TRINOP(name) \
name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \ #define GRID_DEF_UNOP(op, name) \
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \ template <typename T1, \
inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \ typename std::enable_if<is_lattice<T1>::value || \
{ \ is_lattice_expr<T1>::value, \
return LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \ T1>::type * = nullptr> \
inline auto op(const T1 &arg) \
->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg))); \
} }
#define GRID_BINOP_LEFT(op, name) \ #define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \ template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \ typename std::enable_if<is_lattice<T1>::value || \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \ inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs)) \ ->decltype( \
{ \ LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs);\ std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
} }
#define GRID_BINOP_RIGHT(op, name) \ #define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \ template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value&&!is_lattice_expr<T1>::value,T1>::type * = nullptr, \ typename std::enable_if<!is_lattice<T1>::value && \
typename std::enable_if< is_lattice<T2>::value|| is_lattice_expr<T2>::value,T2>::type * = nullptr> \ !is_lattice_expr<T1>::value, \
T1>::type * = nullptr, \
typename std::enable_if<is_lattice<T2>::value || \
is_lattice_expr<T2>::value, \
T2>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \ inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs)) \ ->decltype( \
{ \ LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs); \ std::make_pair(GRID_BINOP(name)(), \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
} }
#define GRID_DEF_BINOP(op, name) \ #define GRID_DEF_BINOP(op, name) \
@@ -319,14 +345,18 @@ GridTrinOpClass(TrinaryWhere,
#define GRID_DEF_TRINOP(op, name) \ #define GRID_DEF_TRINOP(op, name) \
template <typename T1, typename T2, typename T3> \ template <typename T1, typename T2, typename T3> \
inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \ inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
->decltype(LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs)) \ ->decltype( \
{ \ LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
return LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs); \ const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) { \
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs))); \
} }
//////////////////////// ////////////////////////
// Operator definitions // Operator definitions
//////////////////////// ////////////////////////
GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot);
@@ -370,27 +400,29 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Op, class T1> template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr) auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
{ Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr); expr);
return ret; return ret;
} }
template <class Op, class T1, class T2> template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
{ eval(0, std::get<1>(expr.second))))> {
Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr); Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second))))>
ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2, class T3> template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1), -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, expr.arg2), eval(0, std::get<1>(expr.second)),
eval(0, expr.arg3)))> eval(0, std::get<2>(expr.second))))> {
{ Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
Lattice<decltype(expr.op.func(eval(0, expr.arg1), eval(0, std::get<1>(expr.second)),
eval(0, expr.arg2), eval(0, std::get<2>(expr.second))))>
eval(0, expr.arg3)))> ret(expr); ret(expr);
return ret; return ret;
} }
@@ -401,7 +433,34 @@ auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
#undef GRID_DEF_UNOP #undef GRID_DEF_UNOP
#undef GRID_DEF_BINOP #undef GRID_DEF_BINOP
#undef GRID_DEF_TRINOP #undef GRID_DEF_TRINOP
}
NAMESPACE_END(Grid); #if 0
using namespace Grid;
int main(int argc,char **argv){
Lattice<double> v1(16);
Lattice<double> v2(16);
Lattice<double> v3(16);
BinaryAdd<double,double> tmp;
LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &>
expr(std::make_pair(tmp,
std::forward_as_tuple(v1,v2)));
tmp.func(eval(0,v1),eval(0,v2));
auto var = v1+v2;
std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
v3=v1+v2;
v3=v1+v2+v1*v2;
};
void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3)
{
v3=v1+v2+v1*v2;
}
#endif
#endif #endif

View File

@@ -28,230 +28,228 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_ARITH_H #ifndef GRID_LATTICE_ARITH_H
#define GRID_LATTICE_ARITH_H #define GRID_LATTICE_ARITH_H
NAMESPACE_BEGIN(Grid); namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add // avoid copy back routines for mult, mac, sub, add
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.checkerboard = lhs.checkerboard;
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
decltype(coalescedRead(obj1())) tmp; #ifdef STREAMING_STORES
auto lhs_t = lhs_v(ss); obj1 tmp;
auto rhs_t = rhs_v(ss); mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
mult(&tmp,&lhs_t,&rhs_t); vstream(ret._odata[ss],tmp);
coalescedWrite(ret_v[ss],tmp); #else
}); mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
#endif
}
} }
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); #ifdef STREAMING_STORES
auto rhs_v = rhs.View(); obj1 tmp;
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
decltype(coalescedRead(obj1())) tmp; vstream(ret._odata[ss],tmp);
auto lhs_t=lhs_v(ss); #else
auto rhs_t=rhs_v(ss); mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
mac(&tmp,&lhs_t,&rhs_t); #endif
coalescedWrite(ret_v[ss],tmp); }
});
} }
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); #ifdef STREAMING_STORES
auto rhs_v = rhs.View(); obj1 tmp;
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
decltype(coalescedRead(obj1())) tmp; vstream(ret._odata[ss],tmp);
auto lhs_t=lhs_v(ss); #else
auto rhs_t=rhs_v(ss); sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
sub(&tmp,&lhs_t,&rhs_t); #endif
coalescedWrite(ret_v[ss],tmp);
});
} }
template<class obj1,class obj2,class obj3> inline }
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.checkerboard = lhs.checkerboard;
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); #ifdef STREAMING_STORES
auto rhs_v = rhs.View(); obj1 tmp;
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
decltype(coalescedRead(obj1())) tmp; vstream(ret._odata[ss],tmp);
auto lhs_t=lhs_v(ss); #else
auto rhs_t=rhs_v(ss); add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
add(&tmp,&lhs_t,&rhs_t); #endif
coalescedWrite(ret_v[ss],tmp); }
});
} }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add // avoid copy back routines for mult, mac, sub, add
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.checkerboard = lhs.checkerboard;
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); obj1 tmp;
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ mult(&tmp,&lhs._odata[ss],&rhs);
decltype(coalescedRead(obj1())) tmp; vstream(ret._odata[ss],tmp);
mult(&tmp,&lhs_v(ss),&rhs); }
coalescedWrite(ret_v[ss],tmp);
});
} }
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.checkerboard = lhs.checkerboard;
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); obj1 tmp;
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ mac(&tmp,&lhs._odata[ss],&rhs);
decltype(coalescedRead(obj1())) tmp; vstream(ret._odata[ss],tmp);
auto lhs_t=lhs_v(ss); }
mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
});
} }
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.checkerboard = lhs.checkerboard;
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); #ifdef STREAMING_STORES
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ obj1 tmp;
decltype(coalescedRead(obj1())) tmp; sub(&tmp,&lhs._odata[ss],&rhs);
auto lhs_t=lhs_v(ss); vstream(ret._odata[ss],tmp);
sub(&tmp,&lhs_t,&rhs); #else
coalescedWrite(ret_v[ss],tmp); sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
}); #endif
} }
template<class obj1,class obj2,class obj3> inline }
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.checkerboard = lhs.checkerboard;
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); #ifdef STREAMING_STORES
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ obj1 tmp;
decltype(coalescedRead(obj1())) tmp; add(&tmp,&lhs._odata[ss],&rhs);
auto lhs_t=lhs_v(ss); vstream(ret._odata[ss],tmp);
add(&tmp,&lhs_t,&rhs); #else
coalescedWrite(ret_v[ss],tmp); add(&ret._odata[ss],&lhs._odata[ss],&rhs);
}); #endif
}
} }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add // avoid copy back routines for mult, mac, sub, add
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
auto rhs_v = lhs.View(); #ifdef STREAMING_STORES
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ obj1 tmp;
decltype(coalescedRead(obj1())) tmp; mult(&tmp,&lhs,&rhs._odata[ss]);
auto rhs_t=rhs_v(ss); vstream(ret._odata[ss],tmp);
mult(&tmp,&lhs,&rhs_t); #else
coalescedWrite(ret_v[ss],tmp); mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}); #endif
}
} }
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
auto rhs_v = lhs.View(); #ifdef STREAMING_STORES
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ obj1 tmp;
decltype(coalescedRead(obj1())) tmp; mac(&tmp,&lhs,&rhs._odata[ss]);
auto rhs_t=rhs_v(ss); vstream(ret._odata[ss],tmp);
mac(&tmp,&lhs,&rhs_t); #else
coalescedWrite(ret_v[ss],tmp); mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}); #endif
}
} }
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> strong_inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
auto rhs_v = lhs.View(); #ifdef STREAMING_STORES
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ obj1 tmp;
decltype(coalescedRead(obj1())) tmp; sub(&tmp,&lhs,&rhs._odata[ss]);
auto rhs_t=rhs_v(ss); vstream(ret._odata[ss],tmp);
sub(&tmp,&lhs,&rhs_t); #else
coalescedWrite(ret_v[ss],tmp); sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}); #endif
} }
template<class obj1,class obj2,class obj3> inline }
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.checkerboard = rhs.checkerboard;
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
auto rhs_v = lhs.View(); #ifdef STREAMING_STORES
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ obj1 tmp;
decltype(coalescedRead(obj1())) tmp; add(&tmp,&lhs,&rhs._odata[ss]);
auto rhs_t=rhs_v(ss); vstream(ret._odata[ss],tmp);
add(&tmp,&lhs,&rhs_t); #else
coalescedWrite(ret_v[ss],tmp); add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}); #endif
}
} }
template<class sobj,class vobj> inline template<class sobj,class vobj> strong_inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.Checkerboard() = x.Checkerboard(); ret.checkerboard = x.checkerboard;
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<x._grid->oSites();ss++){
auto x_v = x.View(); #ifdef STREAMING_STORES
auto y_v = y.View(); vobj tmp = a*x._odata[ss]+y._odata[ss];
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ vstream(ret._odata[ss],tmp);
auto tmp = a*x_v(ss)+y_v(ss); #else
coalescedWrite(ret_v[ss],tmp); ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
}); #endif
} }
template<class sobj,class vobj> inline }
template<class sobj,class vobj> strong_inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.Checkerboard() = x.Checkerboard(); ret.checkerboard = x.checkerboard;
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<x._grid->oSites();ss++){
auto x_v = x.View(); #ifdef STREAMING_STORES
auto y_v = y.View(); vobj tmp = a*x._odata[ss]+b*y._odata[ss];
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ vstream(ret._odata[ss],tmp);
auto tmp = a*x_v(ss)+b*y_v(ss); #else
coalescedWrite(ret_v[ss],tmp); ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
}); #endif
}
} }
template<class sobj,class vobj> inline template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
{
return axpy_norm_fast(ret,a,x,y); return axpy_norm_fast(ret,a,x,y);
} }
template<class sobj,class vobj> inline template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
{
return axpby_norm_fast(ret,a,b,x,y); return axpby_norm_fast(ret,a,b,x,y);
} }
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -28,431 +28,311 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_LATTICE_BASE_H
#define GRID_LATTICE_BASE_H
#define STREAMING_STORES #define STREAMING_STORES
NAMESPACE_BEGIN(Grid); namespace Grid {
// TODO:
// mac,real,imag
// Functionality:
// -=,+=,*=,()
// add,+,sub,-,mult,mac,*
// adj,conjugate
// real,imag
// transpose,transposeIndex
// trace,traceIndex
// peekIndex
// innerProduct,outerProduct,
// localNorm2
// localInnerProduct
extern int GridCshiftPermuteMap[4][16]; extern int GridCshiftPermuteMap[4][16];
/////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour // Basic expressions used in Expression Template
/////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////
class LatticeBase {};
///////////////////////////////////////////////////////////////////////////////////////// class LatticeBase
// Conformable checks; same instance of Grid required {
///////////////////////////////////////////////////////////////////////////////////////// public:
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs) virtual ~LatticeBase(void) = default;
GridBase *_grid;
};
class LatticeExpressionBase {};
template <typename Op, typename T1>
class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
public:
LatticeUnaryExpression(const std::pair<Op,std::tuple<T1> > &arg): std::pair<Op,std::tuple<T1> >(arg) {};
};
template <typename Op, typename T1, typename T2>
class LatticeBinaryExpression : public std::pair<Op,std::tuple<T1,T2> > , public LatticeExpressionBase {
public:
LatticeBinaryExpression(const std::pair<Op,std::tuple<T1,T2> > &arg): std::pair<Op,std::tuple<T1,T2> >(arg) {};
};
template <typename Op, typename T1, typename T2, typename T3>
class LatticeTrinaryExpression :public std::pair<Op,std::tuple<T1,T2,T3> >, public LatticeExpressionBase {
public:
LatticeTrinaryExpression(const std::pair<Op,std::tuple<T1,T2,T3> > &arg): std::pair<Op,std::tuple<T1,T2,T3> >(arg) {};
};
void inline conformable(GridBase *lhs,GridBase *rhs)
{ {
assert(lhs == rhs); assert(lhs == rhs);
} }
//////////////////////////////////////////////////////////////////////////// template<class vobj>
// Minimal base class containing only data valid to access from accelerator class Lattice : public LatticeBase
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{ {
protected: public:
GridBase *_grid;
int checkerboard; int checkerboard;
vobj *_odata; // A managed pointer Vector<vobj> _odata;
uint64_t _odata_size;
// to pthread need a computable loop where loop induction is not required
int begin(void) { return 0;};
int end(void) { return _odata.size(); }
vobj & operator[](int i) { return _odata[i]; };
const vobj & operator[](int i) const { return _odata[i]; };
public: public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
/////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class Lattice : public LatticeAccelerator<vobj>
{
public:
GridBase *Grid(void) const { return this->_grid; }
///////////////////////////////////////////////////
// Member types
///////////////////////////////////////////////////
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef vobj vector_object; typedef vobj vector_object;
private:
void dealloc(void)
{
if( this->_odata_size ) {
alignedAllocator<vobj> alloc;
alloc.deallocate(this->_odata,this->_odata_size);
this->_odata=nullptr;
this->_odata_size=0;
}
}
void resize(uint64_t size)
{
if ( this->_odata_size != size ) {
alignedAllocator<vobj> alloc;
dealloc();
this->_odata_size = size;
if ( size )
this->_odata = alloc.allocate(this->_odata_size);
else
this->_odata = nullptr;
}
}
public:
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
~Lattice() {
if ( this->_odata_size ) {
dealloc();
}
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Expression Template closure support // Expression Template closure support
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr) template <typename Op, typename T1> strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; checkerboard=cb;
auto me = View(); parallel_for(int ss=0;ss<_grid->oSites();ss++){
accelerator_for(ss,me.size(),1,{ #ifdef STREAMING_STORES
auto tmp = eval(ss,expr); vobj tmp = eval(ss,expr);
vstream(me[ss],tmp); vstream(_odata[ss] ,tmp);
}); #else
_odata[ss]=eval(ss,expr);
#endif
}
return *this; return *this;
} }
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; checkerboard=cb;
auto me = View(); parallel_for(int ss=0;ss<_grid->oSites();ss++){
accelerator_for(ss,me.size(),1,{ #ifdef STREAMING_STORES
auto tmp = eval(ss,expr); vobj tmp = eval(ss,expr);
vstream(me[ss],tmp); vstream(_odata[ss] ,tmp);
}); #else
_odata[ss]=eval(ss,expr);
#endif
}
return *this; return *this;
} }
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; checkerboard=cb;
auto me = View();
accelerator_for(ss,me.size(),1,{ parallel_for(int ss=0;ss<_grid->oSites();ss++){
auto tmp = eval(ss,expr); #ifdef STREAMING_STORES
vstream(me[ss],tmp); //vobj tmp = eval(ss,expr);
}); vstream(_odata[ss] ,eval(ss,expr));
#else
_odata[ss] = eval(ss,expr);
#endif
}
return *this; return *this;
} }
//GridFromExpression is tricky to do //GridFromExpression is tricky to do
template<class Op,class T1> template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr) { Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
this->_grid = nullptr; _grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(_grid,expr);
assert(this->_grid!=nullptr); assert(_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; checkerboard=cb;
resize(this->_grid->oSites()); _odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
*this = expr; #ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
} }
};
template<class Op,class T1, class T2> template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) { Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
this->_grid = nullptr; _grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(_grid,expr);
assert(this->_grid!=nullptr); assert(_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; checkerboard=cb;
resize(this->_grid->oSites()); _odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
*this = expr; #ifdef STREAMING_STORES
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
} }
};
template<class Op,class T1, class T2, class T3> template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) { Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
this->_grid = nullptr; _grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(_grid,expr);
assert(this->_grid!=nullptr); assert(_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; checkerboard=cb;
resize(this->_grid->oSites()); _odata.resize(_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
*this = expr; vstream(_odata[ss] ,eval(ss,expr));
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View();
thread_for(ss,me.size(),{
me[ss] = r;
});
return *this;
} }
};
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// Follow rule of five, with Constructor requires "grid" passed // Constructor requires "grid" passed.
// to user defined constructor // what about a default grid?
/////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// user defined constructor Lattice(GridBase *grid) : _odata(grid->oSites()) {
/////////////////////////////////////////// _grid = grid;
Lattice(GridBase *grid) { // _odata.reserve(_grid->oSites());
this->_grid = grid; // _odata.resize(_grid->oSites());
resize(this->_grid->oSites()); // std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
assert((((uint64_t)&this->_odata[0])&0xF) ==0); assert((((uint64_t)&_odata[0])&0xF) ==0);
this->checkerboard=0; checkerboard=0;
} }
// virtual ~Lattice(void) = default; Lattice(const Lattice& r){ // copy constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
Lattice(Lattice&& r){ // move constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata=std::move(r._odata);
}
inline Lattice<vobj> & operator = (Lattice<vobj> && r)
{
_grid = r._grid;
checkerboard = r.checkerboard;
_odata =std::move(r._odata);
return *this;
}
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
virtual ~Lattice(void) = default;
void reset(GridBase* grid) { void reset(GridBase* grid) {
if (this->_grid != grid) { if (_grid != grid) {
this->_grid = grid; _grid = grid;
this->resize(grid->oSites()); _odata.resize(grid->oSites());
this->checkerboard = 0; checkerboard = 0;
} }
} }
///////////////////////////////////////////
// copy constructor
/////////////////////////////////////////// template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
Lattice(const Lattice& r){ parallel_for(int ss=0;ss<_grid->oSites();ss++){
// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl; this->_odata[ss]=r;
this->_grid = r.Grid();
resize(this->_grid->oSites());
*this = r;
} }
///////////////////////////////////////////
// move constructor
///////////////////////////////////////////
Lattice(Lattice && r){
this->_grid = r.Grid();
this->_odata = r._odata;
this->_odata_size = r._odata_size;
this->checkerboard= r.Checkerboard();
r._odata = nullptr;
r._odata_size = 0;
}
///////////////////////////////////////////
// assignment template
///////////////////////////////////////////
template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View();
auto him= r.View();
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this; return *this;
} }
///////////////////////////////////////////
// Copy assignment
///////////////////////////////////////////
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View();
auto him= r.View();
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this;
}
///////////////////////////////////////////
// Move assignment possible if same type
///////////////////////////////////////////
inline Lattice<vobj> & operator = (Lattice<vobj> && r){
resize(0); // deletes if appropriate
this->_grid = r.Grid();
this->_odata = r._odata;
this->_odata_size = r._odata_size;
this->checkerboard= r.Checkerboard();
r._odata = nullptr;
r._odata_size = 0;
return *this;
}
/////////////////////////////////////////////////////////////////////////////
// *=,+=,-= operators inherit behvour from correspond */+/- operation // *=,+=,-= operators inherit behvour from correspond */+/- operation
///////////////////////////////////////////////////////////////////////////// template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {
template<class T> inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r; *this = (*this)*r;
return *this; return *this;
} }
template<class T> inline Lattice<vobj> &operator -=(const T &r) { template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r; *this = (*this)-r;
return *this; return *this;
} }
template<class T> inline Lattice<vobj> &operator +=(const T &r) { template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r; *this = (*this)+r;
return *this; return *this;
} }
friend inline void swap(Lattice &l, Lattice &r) {
conformable(l,r);
LatticeAccelerator<vobj> tmp;
LatticeAccelerator<vobj> *lp = (LatticeAccelerator<vobj> *)&l;
LatticeAccelerator<vobj> *rp = (LatticeAccelerator<vobj> *)&r;
tmp = *lp; *lp=*rp; *rp=tmp;
}
}; // class Lattice }; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
std::vector<int> gcoor;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
sobj ss; sobj ss;
for(int g=0;g<o._grid->_gsites;g++){
o._grid->GlobalIndexToGlobalCoor(g,gcoor);
peekSite(ss,o,gcoor); peekSite(ss,o,gcoor);
stream<<"["; stream<<"[";
for(int d=0;d<gcoor.size();d++){ for(int d=0;d<gcoor.size();d++){
@@ -465,5 +345,31 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Latti
return stream; return stream;
} }
NAMESPACE_END(Grid); }
#include "Lattice_conformable.h"
#define GRID_LATTICE_EXPRESSION_TEMPLATES
#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES
#include "Lattice_ET.h"
#else
#include "Lattice_overload.h"
#endif
#include "Lattice_arith.h"
#include "Lattice_trace.h"
#include "Lattice_transpose.h"
#include "Lattice_local.h"
#include "Lattice_reduction.h"
#include "Lattice_peekpoke.h"
#include "Lattice_reality.h"
#include "Lattice_comparison_utils.h"
#include "Lattice_comparison.h"
#include "Lattice_coordinate.h"
#include "Lattice_where.h"
#include "Lattice_rng.h"
#include "Lattice_unary.h"
#include "Lattice_transfer.h"
#endif

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_COMPARISON_H #ifndef GRID_LATTICE_COMPARISON_H
#define GRID_LATTICE_COMPARISON_H #define GRID_LATTICE_COMPARISON_H
NAMESPACE_BEGIN(Grid); namespace Grid {
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// relational operators // relational operators
@@ -40,78 +40,40 @@ NAMESPACE_BEGIN(Grid);
//Query supporting logical &&, ||, //Query supporting logical &&, ||,
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
typedef iScalar<vInteger> vPredicate ;
/*
template <class iobj, class vobj, class robj> accelerator_inline
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
{
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
*/
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// compare lattice to lattice // compare lattice to lattice
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj> template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vInteger> ret(rhs._grid);
auto lhs_v = lhs.View(); parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
auto rhs_v = rhs.View(); ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]);
auto ret_v = ret.View(); }
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
});
return ret; return ret;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// compare lattice to scalar // compare lattice to scalar
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj> template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{ {
Lattice<vPredicate> ret(lhs.Grid()); Lattice<vInteger> ret(lhs._grid);
auto lhs_v = lhs.View(); parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
auto ret_v = ret.View(); ret._odata[ss]=op(lhs._odata[ss],rhs);
thread_for( ss, lhs_v.size(), { }
ret_v[ss]=op(lhs_v[ss],rhs);
});
return ret; return ret;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// compare scalar to lattice // compare scalar to lattice
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj> template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vInteger> ret(rhs._grid);
auto rhs_v = rhs.View(); parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
auto ret_v = ret.View(); ret._odata[ss]=op(lhs._odata[ss],rhs);
thread_for( ss, rhs_v.size(), { }
ret_v[ss]=op(lhs,rhs_v[ss]);
});
return ret; return ret;
} }
@@ -120,88 +82,88 @@ inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattic
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Less than // Less than
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs); return LLComparison(vlt<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs); return LSComparison(vlt<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs); return SLComparison(vlt<lobj,robj>(),lhs,rhs);
} }
// Less than equal // Less than equal
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs); return LLComparison(vle<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs); return LSComparison(vle<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs); return SLComparison(vle<lobj,robj>(),lhs,rhs);
} }
// Greater than // Greater than
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs); return LLComparison(vgt<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs); return LSComparison(vgt<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs); return SLComparison(vgt<lobj,robj>(),lhs,rhs);
} }
// Greater than equal // Greater than equal
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs); return LLComparison(vge<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs); return LSComparison(vge<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs); return SLComparison(vge<lobj,robj>(),lhs,rhs);
} }
// equal // equal
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs); return LLComparison(veq<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs); return LSComparison(veq<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs); return SLComparison(veq<lobj,robj>(),lhs,rhs);
} }
// not equal // not equal
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs); return LLComparison(vne<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs); return LSComparison(vne<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs); return SLComparison(vne<lobj,robj>(),lhs,rhs);
} }
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -26,10 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
#pragma once namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////// /////////////////////////////////////////
// This implementation is a bit poor. // This implementation is a bit poor.
@@ -44,42 +44,42 @@ NAMESPACE_BEGIN(Grid);
// //
template<class lobj,class robj> class veq { template<class lobj,class robj> class veq {
public: public:
accelerator vInteger operator()(const lobj &lhs, const robj &rhs) vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) == (rhs); return (lhs) == (rhs);
} }
}; };
template<class lobj,class robj> class vne { template<class lobj,class robj> class vne {
public: public:
accelerator vInteger operator()(const lobj &lhs, const robj &rhs) vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) != (rhs); return (lhs) != (rhs);
} }
}; };
template<class lobj,class robj> class vlt { template<class lobj,class robj> class vlt {
public: public:
accelerator vInteger operator()(const lobj &lhs, const robj &rhs) vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) < (rhs); return (lhs) < (rhs);
} }
}; };
template<class lobj,class robj> class vle { template<class lobj,class robj> class vle {
public: public:
accelerator vInteger operator()(const lobj &lhs, const robj &rhs) vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) <= (rhs); return (lhs) <= (rhs);
} }
}; };
template<class lobj,class robj> class vgt { template<class lobj,class robj> class vgt {
public: public:
accelerator vInteger operator()(const lobj &lhs, const robj &rhs) vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) > (rhs); return (lhs) > (rhs);
} }
}; };
template<class lobj,class robj> class vge { template<class lobj,class robj> class vge {
public: public:
accelerator vInteger operator()(const lobj &lhs, const robj &rhs) vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) >= (rhs); return (lhs) >= (rhs);
} }
@@ -88,42 +88,42 @@ NAMESPACE_BEGIN(Grid);
// Generic list of functors // Generic list of functors
template<class lobj,class robj> class seq { template<class lobj,class robj> class seq {
public: public:
accelerator Integer operator()(const lobj &lhs, const robj &rhs) Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) == (rhs); return (lhs) == (rhs);
} }
}; };
template<class lobj,class robj> class sne { template<class lobj,class robj> class sne {
public: public:
accelerator Integer operator()(const lobj &lhs, const robj &rhs) Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) != (rhs); return (lhs) != (rhs);
} }
}; };
template<class lobj,class robj> class slt { template<class lobj,class robj> class slt {
public: public:
accelerator Integer operator()(const lobj &lhs, const robj &rhs) Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) < (rhs); return (lhs) < (rhs);
} }
}; };
template<class lobj,class robj> class sle { template<class lobj,class robj> class sle {
public: public:
accelerator Integer operator()(const lobj &lhs, const robj &rhs) Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) <= (rhs); return (lhs) <= (rhs);
} }
}; };
template<class lobj,class robj> class sgt { template<class lobj,class robj> class sgt {
public: public:
accelerator Integer operator()(const lobj &lhs, const robj &rhs) Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) > (rhs); return (lhs) > (rhs);
} }
}; };
template<class lobj,class robj> class sge { template<class lobj,class robj> class sge {
public: public:
accelerator Integer operator()(const lobj &lhs, const robj &rhs) Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) >= (rhs); return (lhs) >= (rhs);
} }
@@ -133,12 +133,12 @@ NAMESPACE_BEGIN(Grid);
// Integer and real get extra relational functions. // Integer and real get extra relational functions.
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); std::vector<scalar> vrhs(vsimd::Nsimd());
ExtractBuffer<Integer> vpred(vsimd::Nsimd()); std::vector<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(lhs,vlhs); extract<vsimd,scalar>(lhs,vlhs);
extract<vsimd,scalar>(rhs,vrhs); extract<vsimd,scalar>(rhs,vrhs);
@@ -150,11 +150,11 @@ NAMESPACE_BEGIN(Grid);
} }
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs) inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<Integer> vpred(vsimd::Nsimd()); std::vector<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(lhs,vlhs); extract<vsimd,scalar>(lhs,vlhs);
for(int s=0;s<vsimd::Nsimd();s++){ for(int s=0;s<vsimd::Nsimd();s++){
@@ -165,11 +165,11 @@ NAMESPACE_BEGIN(Grid);
} }
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs) inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation std::vector<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<Integer> vpred(vsimd::Nsimd()); std::vector<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(rhs,vrhs); extract<vsimd,scalar>(rhs,vrhs);
for(int s=0;s<vsimd::Nsimd();s++){ for(int s=0;s<vsimd::Nsimd();s++){
@@ -181,30 +181,30 @@ NAMESPACE_BEGIN(Grid);
#define DECLARE_RELATIONAL_EQ(op,functor) \ #define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\ inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \ inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \ inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd>\ template<class vsimd>\
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \ inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \ { \
return lhs._internal op rhs; \ return lhs._internal op rhs; \
} \ } \
template<class vsimd>\ template<class vsimd>\
accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \ inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \ { \
return lhs op rhs._internal; \ return lhs op rhs._internal; \
} \ } \
@@ -212,7 +212,7 @@ NAMESPACE_BEGIN(Grid);
#define DECLARE_RELATIONAL(op,functor) \ #define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \ DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\ template<class vsimd>\
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\ inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \ { \
return lhs._internal op rhs._internal; \ return lhs._internal op rhs._internal; \
} }
@@ -226,7 +226,7 @@ DECLARE_RELATIONAL(!=,sne);
#undef DECLARE_RELATIONAL #undef DECLARE_RELATIONAL
NAMESPACE_END(Grid); }
#endif

View File

@@ -28,13 +28,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_CONFORMABLE_H #ifndef GRID_LATTICE_CONFORMABLE_H
#define GRID_LATTICE_CONFORMABLE_H #define GRID_LATTICE_CONFORMABLE_H
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs) template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{ {
assert(lhs.Grid() == rhs.Grid()); assert(lhs._grid == rhs._grid);
assert(lhs.Checkerboard() == rhs.Checkerboard()); assert(lhs.checkerboard == rhs.checkerboard);
} }
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -25,49 +25,32 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_LATTICE_COORDINATE_H
#define GRID_LATTICE_COORDINATE_H
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu) template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
{ {
typedef typename iobj::scalar_type scalar_type; typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type; typedef typename iobj::vector_type vector_type;
GridBase *grid = l.Grid(); GridBase *grid = l._grid;
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
auto l_v = l.View(); std::vector<int> gcoor;
thread_for( o, grid->oSites(), { std::vector<scalar_type> mergebuf(Nsimd);
vector_type vI; vector_type vI;
Coordinate gcoor; for(int o=0;o<grid->oSites();o++){
ExtractBuffer<scalar_type> mergebuf(Nsimd);
for(int i=0;i<grid->iSites();i++){ for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu]; mergebuf[i]=(Integer)gcoor[mu];
} }
merge<vector_type,scalar_type>(vI,mergebuf); merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI; l._odata[o]=vI;
}); }
}; };
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
} }
}} #endif
}
NAMESPACE_END(Grid);

View File

@@ -32,7 +32,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// localInner, localNorm, outerProduct // localInner, localNorm, outerProduct
/////////////////////////////////////////////// ///////////////////////////////////////////////
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Non site, reduced locally reduced routines // Non site, reduced locally reduced routines
@@ -42,12 +42,10 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
auto rhs_v = rhs.View(); parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
auto ret_v = ret.View(); ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ }
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
});
return ret; return ret;
} }
@@ -55,33 +53,23 @@ inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tenso
template<class vobj> template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
auto lhs_v = lhs.View(); parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
auto rhs_v = rhs.View(); ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
auto ret_v = ret.View(); }
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
});
return ret; return ret;
} }
// outerProduct Scalar x Scalar -> Scalar // outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix // Vector x Vector -> Matrix
template<class ll,class rr> template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(ll(),rr()))> inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
{ {
typedef decltype(coalescedRead(ll())) sll; Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
typedef decltype(coalescedRead(rr())) srr; parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid()); ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
auto lhs_v = lhs.View(); }
auto rhs_v = rhs.View();
auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop
ret_v[ss]=outerProduct(lhs_v[ss],rhs_v[ss]);
});
return ret; return ret;
} }
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -1,202 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
NAMESPACE_BEGIN(Grid);
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto Y_v = Y.View();
auto R_v = R.View();
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_loop_collapse2( (int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto R_v = R.View();
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_loop_collapse2( (int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
thread_region {
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_loop_collapse2((int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
ComplexD z = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(z),imag(z));
}}
}});
thread_critical {
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,138 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_overload.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_OVERLOAD_H
#define GRID_LATTICE_OVERLOAD_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
Lattice<vobj> ret(r._grid);
parallel_for(int ss=0;ss<r._grid->oSites();ss++){
vstream(ret._odata[ss], -r._odata[ss]);
}
return ret;
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
//NB mult performs conformable check. Do not reapply here for performance.
/////////////////////////////////////////////////////////////////////////////////////
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]*rhs._odata[0])> ret(rhs._grid);
mult(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]+rhs._odata[0])> ret(rhs._grid);
add(ret,lhs,rhs);
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
Lattice<decltype(lhs._odata[0]-rhs._odata[0])> ret(rhs._grid);
sub(ret,lhs,rhs);
return ret;
}
// Scalar BinOp Lattice ;generate return type
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs*rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs+rhs._odata[0]) tmp =lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs+rhs._odata[ss];
}
return ret;
}
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];
vstream(ret._odata[ss],tmp);
}
return ret;
}
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]*rhs;
}
return ret;
}
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]+rhs;
}
return ret;
}
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
vstream(ret._odata[ss],tmp);
// ret._odata[ss]=lhs._odata[ss]-rhs;
}
return ret;
}
}
#endif

View File

@@ -34,35 +34,29 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
// Peeking and poking around // Peeking and poking around
/////////////////////////////////////////////// ///////////////////////////////////////////////
NAMESPACE_BEGIN(Grid); namespace Grid {
// FIXME accelerator_loop and accelerator_inline these
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Peek internal indices of a Lattice object // Peek internal indices of a Lattice object
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(vobj(),i))> auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))>
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid);
ret.Checkerboard()=lhs.Checkerboard(); ret.checkerboard=lhs.checkerboard;
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i);
thread_for( ss, lhs_v.size(), { }
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
});
return ret; return ret;
}; };
template<int Index,class vobj> template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(vobj(),i,j))> auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))>
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid);
ret.Checkerboard()=lhs.Checkerboard(); ret.checkerboard=lhs.checkerboard;
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j);
thread_for( ss, lhs_v.size(), { }
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
});
return ret; return ret;
}; };
@@ -70,38 +64,34 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
// Poke internal indices of a Lattice object // Poke internal indices of a Lattice object
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i)
{ {
auto rhs_v = rhs.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i);
thread_for( ss, lhs_v.size(), { }
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
});
} }
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j)
{ {
auto rhs_v = rhs.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j);
thread_for( ss, lhs_v.size(), { }
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
});
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Poke a scalar object into the SIMD array // Poke a scalar object into the SIMD array
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj,class sobj> template<class vobj,class sobj>
void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){ void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){
GridBase *grid=l.Grid(); GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx; int rank,odx,idx;
@@ -109,13 +99,13 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
grid->Broadcast(grid->BossRank(),s); grid->Broadcast(grid->BossRank(),s);
std::vector<sobj> buf(Nsimd);
// extract-modify-merge cycle is easiest way and this is not perf critical // extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
if ( rank == grid->ThisRank() ) { if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf); extract(l._odata[odx],buf);
buf[idx] = s; buf[idx] = s;
merge(l_v[odx],buf); merge(l._odata[odx],buf);
} }
return; return;
@@ -126,23 +116,22 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
template<class vobj,class sobj> template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){ void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){
GridBase *grid=l.Grid(); GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard() == l.Grid()->CheckerBoard(site)); assert( l.checkerboard == l._grid->CheckerBoard(site));
int rank,odx,idx; int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd); std::vector<sobj> buf(Nsimd);
auto l_v = l.View(); extract(l._odata[odx],buf);
extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@@ -156,16 +145,16 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
template<class vobj,class sobj> template<class vobj,class sobj>
accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){ void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid = l.Grid(); GridBase *grid = l._grid;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -173,8 +162,7 @@ accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
@@ -185,16 +173,16 @@ accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate
}; };
template<class vobj,class sobj> template<class vobj,class sobj>
accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){ void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){
GridBase *grid=l.Grid(); GridBase *grid=l._grid;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -202,9 +190,9 @@ accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w]; vp[idx+w*Nsimd] = pt[w];
} }
@@ -212,6 +200,6 @@ accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate
return; return;
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -36,28 +36,22 @@ Author: neo <cossu@post.kek.jp>
// The choice of burying complex in the SIMD // The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome // is making the use of "real" and "imag" very cumbersome
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs._grid);
auto lhs_v = lhs.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto ret_v = ret.View(); ret._odata[ss] = adj(lhs._odata[ss]);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { }
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
});
return ret; return ret;
}; };
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs._grid);
auto lhs_v = lhs.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto ret_v = ret.View(); ret._odata[ss] = conjugate(lhs._odata[ss]);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { }
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
});
return ret; return ret;
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -19,76 +19,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_LATTICE_REDUCTION_H
#define GRID_LATTICE_REDUCTION_H
#include <Grid/Grid_Eigen_Dense.h> #include <Grid/Grid_Eigen_Dense.h>
namespace Grid {
#ifdef GRID_NVCC #ifdef GRID_WARN_SUBOPTIMAL
#include <Grid/lattice/Lattice_reduction_gpu.h> #warning "Optimisation alert all these reduction loops are NOT threaded "
#endif #endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
// FIXME this should promote to double and accumulate
//////////////////////////////////////////////////////
template<class vobj>
inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#ifdef GRID_NVCC
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto arg_v = arg.View();
Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations // Deterministic Reduction operations
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg); auto nrm = innerProduct(arg,arg);
return real(nrm); return std::real(nrm);
} }
// Double inner product // Double inner product
@@ -97,49 +43,32 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
ComplexD nrm; GridBase *grid = left._grid;
const int pad = 8;
GridBase *grid = left.Grid(); ComplexD inner;
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
// Might make all code paths go this way. parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
auto left_v = left.View(); int nwork, mywork, myoff;
auto right_v=right.View(); GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
const uint64_t nsimd = grid->Nsimd(); decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
const uint64_t sites = grid->oSites(); for(int ss=myoff;ss<mywork+myoff; ss++){
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
}
// All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}
#ifdef GRID_NVCC inner=0.0;
// GPU - SIMT lane compliance... for(int i=0;i<grid->SumArraySize();i++){
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; inner = inner+sumarray[i*pad];
Vector<inner_t> inner_tmp(sites); }
auto inner_tmp_v = &inner_tmp[0]; right._grid->GlobalSum(inner);
return inner;
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
})
// This is in single precision and fails some tests
// Need a sumD that sums in double
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
grid->GlobalSum(nrm);
return nrm;
} }
///////////////////////// /////////////////////////
@@ -157,7 +86,8 @@ axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj
template<class sobj,class vobj> strong_inline RealD template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
z.Checkerboard() = x.Checkerboard(); const int pad = 8;
z.checkerboard = x.checkerboard;
conformable(z,x); conformable(z,x);
conformable(x,y); conformable(x,y);
@@ -165,57 +95,43 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
RealD nrm; RealD nrm;
GridBase *grid = x.Grid(); GridBase *grid = x._grid;
auto x_v=x.View(); Vector<RealD> sumarray(grid->SumArraySize()*pad);
auto y_v=y.View();
auto z_v=z.View();
const uint64_t nsimd = grid->Nsimd(); parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
const uint64_t sites = grid->oSites(); int nwork, mywork, myoff;
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
#ifdef GRID_NVCC // private to thread; sub summation
// GPU decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; for(int ss=myoff;ss<mywork+myoff; ss++){
Vector<inner_t> inner_tmp(sites); vobj tmp = a*x._odata[ss]+b*y._odata[ss];
auto inner_tmp_v = &inner_tmp[0]; vnrm = vnrm + innerProductD(tmp,tmp);
vstream(z._odata[ss],tmp);
}
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}
accelerator_for( ss, sites, nsimd,{ nrm = 0.0; // sum across threads; linear in thread count but fast
auto tmp = a*x_v(ss)+b*y_v(ss); for(int i=0;i<grid->SumArraySize();i++){
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); nrm = nrm+sumarray[i*pad];
coalescedWrite(z_v[ss],tmp); }
}); z._grid->GlobalSum(nrm);
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
});
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm);
return nrm; return nrm;
} }
template<class Op,class T1> template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
template<class Op,class T1,class T2> template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr) inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
@@ -223,14 +139,54 @@ inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
template<class Op,class T1,class T2,class T3> template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.op.func(eval(0,expr.arg1), ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,expr.arg2), eval(0,std::get<1>(expr.second)),
eval(0,expr.arg3) eval(0,std::get<2>(expr.second))
))::scalar_object ))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -243,7 +199,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// But easily avoided by using double precision fields // But easily avoided by using double precision fields
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *grid = Data.Grid(); GridBase *grid = Data._grid;
assert(grid!=NULL); assert(grid!=NULL);
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
@@ -256,13 +212,13 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
Vector<vobj> lvSum(rd); // will locally sum vectors first std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD std::vector<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
lvSum[r]=Zero(); lvSum[r]=zero;
} }
int e1= grid->_slice_nblock[orthogdim]; int e1= grid->_slice_nblock[orthogdim];
@@ -271,19 +227,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
auto Data_v=Data.View(); parallel_for(int r=0;r<rd;r++){
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss]; lvSum[r]=lvSum[r]+Data._odata[ss];
}
} }
} }
});
// Sum across simd lanes in the plane, breaking out orthog dir. // Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd); std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){ for(int rt=0;rt<rd;rt++){
@@ -308,7 +265,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
if ( pt == grid->_processor_coor[orthogdim] ) { if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt]; gsum=lsSum[lt];
} else { } else {
gsum=Zero(); gsum=zero;
} }
grid->GlobalSum(gsum); grid->GlobalSum(gsum);
@@ -317,14 +274,123 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
} }
} }
template<class vobj>
static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
// std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
typedef typename vobj::scalar_type scalar_type;
std::vector<scalar_type> lsSum;
localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
// std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
}
template <class vobj>
static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
// std::cout << GridLogMessage << "Start prep" << std::endl;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
assert(grid!=NULL);
conformable(grid,rhs._grid);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// std::cout << GridLogMessage << "Start alloc" << std::endl;
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type>> extracted(Nsimd); // splitting the SIMD
// std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv;
parallel_for(int r=0;r<rd;r++)
{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
lvSum[r] = lvSum[r] + vv;
}
}
}
// std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
}
template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
int fd = result.size();
int ld = lsSum.size();
// sum over nodes.
std::vector<scalar_type> gsum;
gsum.resize(fd, scalar_type(0.0));
// std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum[t]=lsSum[lt];
}
}
// std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
// std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
grid->GlobalSumVector(&gsum[0], fd);
// std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
result = gsum;
}
template<class vobj> template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid(); GridBase *grid = lhs._grid;
assert(grid!=NULL); assert(grid!=NULL);
conformable(grid,rhs.Grid()); conformable(grid,rhs._grid);
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
@@ -336,36 +402,34 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
Vector<vector_type> lvSum(rd); // will locally sum vectors first std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
lvSum[r]=Zero(); lvSum[r]=zero;
} }
int e1= grid->_slice_nblock[orthogdim]; int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View(); parallel_for(int r=0;r<rd;r++){
auto rhv=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhv[ss],rhv[ss])); vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
lvSum[r]=lvSum[r]+vv; lvSum[r]=lvSum[r]+vv;
} }
} }
}); }
// Sum across simd lanes in the plane, breaking out orthog dir. // Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd); std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){ for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp; iScalar<vector_type> temp;
@@ -406,7 +470,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = rhs.Grid()->GlobalDimensions()[Orthog]; int Nblock = rhs._grid->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock); std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock); sn.resize(Nblock);
@@ -428,7 +492,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
scalar_type zscale(scale); scalar_type zscale(scale);
GridBase *grid = X.Grid(); GridBase *grid = X._grid;
int Nsimd =grid->Nsimd(); int Nsimd =grid->Nsimd();
int Nblock =grid->GlobalDimensions()[orthogdim]; int Nblock =grid->GlobalDimensions()[orthogdim];
@@ -441,7 +505,8 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
int e2 =grid->_slice_block [orthogdim]; int e2 =grid->_slice_block [orthogdim];
int stride =grid->_slice_stride[orthogdim]; int stride =grid->_slice_stride[orthogdim];
Coordinate icoor; std::vector<int> icoor;
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@@ -457,15 +522,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
auto Rv=R.View(); parallel_for_nest2(int n=0;n<e1;n++){
auto Xv=X.View();
auto Yv=Y.View();
thread_for_collapse(2, n, e1, {
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss]; R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
}
} }
});
} }
}; };
@@ -497,18 +559,18 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog]; int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid(); GridBase *FullGrid = X._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension; int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
// int nl = nh-1; int nl = nh-1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@@ -516,31 +578,28 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
auto X_v=X.View();
auto Y_v=Y.View();
auto R_v=R.View();
thread_region
{ {
Vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, { #pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){ for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride]; s_x[i] = X[o+i*ostride];
} }
vobj dot; vobj dot;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride]; dot = Y[o+i*ostride];
for(int j=0;j<Nblock;j++){ for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i)); dot = dot + s_x[j]*(scale*aa(j,i));
} }
R_v[o+i*ostride]=dot; R[o+i*ostride]=dot;
} }
}}); }}
} }
}; };
@@ -551,17 +610,17 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog]; int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid(); GridBase *FullGrid = X._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension; int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
// int nl=1; int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@@ -569,19 +628,17 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View(); #pragma omp parallel
auto X_v = X.View();
thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
thread_for_collapse_in_region( 2 ,n,nblock,{ for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){ for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride]; s_x[i] = X[o+i*ostride];
} }
vobj dot; vobj dot;
@@ -590,10 +647,11 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
for(int j=1;j<Nblock;j++){ for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i)); dot = dot + s_x[j]*(scale*aa(j,i));
} }
R_v[o+i*ostride]=dot; R[o+i*ostride]=dot;
} }
}}); }}
} }
}; };
@@ -604,7 +662,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid(); GridBase *FullGrid = lhs._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog]; int Nblock = FullGrid->GlobalDimensions()[Orthog];
@@ -615,9 +673,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension; int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
// int nl = nh-1; int nl = nh-1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@@ -628,33 +686,31 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View(); #pragma omp parallel
auto rhs_v=rhs.View();
thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{ #pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){ for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride]; Left [i] = lhs[o+i*ostride];
Right[i] = rhs_v[o+i*ostride]; Right[i] = rhs[o+i*ostride];
} }
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){ for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]); auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp); auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp); mat_thread(i,j) += Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}} }}
}}); }}
thread_critical #pragma omp critical
{ {
mat += mat_thread; mat += mat_thread;
} }
@@ -670,8 +726,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
return; return;
} }
NAMESPACE_END(Grid); } /*END NAMESPACE GRID*/
#endif

View File

@@ -1,226 +0,0 @@
NAMESPACE_BEGIN(Grid);
#define WARP_SIZE 32
extern cudaDeviceProp *gpu_props;
__device__ unsigned int retirementCount = 0;
template <class Iterator>
unsigned int nextPow2(Iterator x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
cudaGetDevice(&device);
Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
}
// let the number of threads in a block be a multiple of 2, starting from warpSize
threads = warpSize;
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
}
template <class sobj, class Iterator>
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) {
Iterator blockSize = blockDim.x;
// cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp();
const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1);
sobj beta, temp;
memcpy((void *)&beta, (void *)&mySum, sizeof(sobj));
for (int i = VEC/2; i > 0; i>>=1) {
if (vid < i) {
memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj));
beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
}
__syncwarp();
}
__syncthreads();
if (threadIdx.x == 0) {
beta = Zero();
for (Iterator i = 0; i < blockSize; i += VEC) {
memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj));
beta += temp;
}
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
}
__syncthreads();
}
template <class vobj, class sobj, class Iterator>
__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
{
constexpr Iterator nsimd = vobj::Nsimd();
Iterator blockSize = blockDim.x;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *sdata = (sobj *)shmem_pointer;
// first level of reduction,
// each thread writes result in mySum
Iterator tid = threadIdx.x;
Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
Iterator gridSize = blockSize*2*gridDim.x;
sobj mySum = Zero();
while (i < n) {
Iterator lane = i % nsimd;
Iterator ss = i / nsimd;
auto tmp = extractLane(lane,g_idata[ss]);
sobj tmpD;
tmpD=tmp;
mySum +=tmpD;
if (i + blockSize < n) {
lane = (i+blockSize) % nsimd;
ss = (i+blockSize) / nsimd;
tmp = extractLane(lane,g_idata[ss]);
tmpD = tmp;
mySum += tmpD;
}
i += gridSize;
}
// copy mySum to shared memory and perform
// reduction for all threads in this block
reduceBlock(sdata, mySum, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class vobj, class sobj,class Iterator>
__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
Iterator blockSize = blockDim.x;
// perform reduction for this block and
// write result to global memory buffer
reduceBlocks(lat, buffer, n);
if (gridDim.x > 1) {
const Iterator tid = threadIdx.x;
__shared__ bool amLast;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
// true if this block is the last block to be done
amLast = (ticket == gridDim.x-1);
}
// each thread must read the correct value of amLast
__syncthreads();
if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1]
Iterator i = tid;
sobj mySum = Zero();
while (i < gridDim.x) {
mySum += buffer[i];
i += blockSize;
}
reduceBlock(smem, mySum, tid);
if (tid==0) {
buffer[0] = smem[0];
// reset count variable
retirementCount = 0;
}
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
auto result = buffer_v[0];
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
NAMESPACE_END(Grid);

View File

@@ -41,7 +41,7 @@
#undef RNG_FAST_DISCARD #undef RNG_FAST_DISCARD
#endif #endif
NAMESPACE_BEGIN(Grid); namespace Grid {
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid // Allow the RNG state to be less dense than the fine grid
@@ -108,16 +108,12 @@ void fillScalar(scalar &s,distribution &dist,generator & gen)
template<class distribution,class generator> template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen) void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{ {
// s=ComplexF(dist(gen),dist(gen)); s=ComplexF(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
} }
template<class distribution,class generator> template<class distribution,class generator>
void fillScalar(ComplexD &s,distribution &dist,generator &gen) void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{ {
// s=ComplexD(dist(gen),dist(gen)); s=ComplexD(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
} }
class GridRNGbase { class GridRNGbase {
@@ -169,10 +165,7 @@ public:
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30; const int shift = 30;
//////////////////////////////////////////////////////////////////// uint64_t skip = site;
// Weird compiler bug in Intel 2018.1 under O3 was generating 32bit and not 64 bit left shift.
////////////////////////////////////////////////////////////////////
volatile uint64_t skip = site;
skip = skip<<shift; skip = skip<<shift;
@@ -263,7 +256,7 @@ public:
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
} };
template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){ template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
dist[0].reset(); dist[0].reset();
@@ -340,13 +333,13 @@ public:
}; };
class GridParallelRNG : public GridRNGbase { class GridParallelRNG : public GridRNGbase {
private:
double _time_counter; double _time_counter;
public:
GridBase *_grid; GridBase *_grid;
unsigned int _vol; unsigned int _vol;
public:
GridBase *Grid(void) const { return _grid; }
int generator_idx(int os,int is) { int generator_idx(int os,int is) {
return is*_grid->oSites()+os; return is*_grid->oSites()+os;
} }
@@ -370,14 +363,13 @@ public:
double inner_time_counter = usecond(); double inner_time_counter = usecond();
int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l._grid too
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int osites = _grid->oSites(); // guaranteed to be <= l._grid->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View(); parallel_for(int ss=0;ss<osites;ss++){
thread_for( ss, osites, { std::vector<scalar_object> buf(Nsimd);
ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
int sm = multiplicity * ss + m; // Maps the generator site to the fine site int sm = multiplicity * ss + m; // Maps the generator site to the fine site
@@ -391,13 +383,12 @@ public:
fillScalar(pointer[idx], dist[gdx], _generators[gdx]); fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
} }
// merge into SIMD lanes, FIXME suboptimal implementation // merge into SIMD lanes, FIXME suboptimal implementation
merge(l_v[sm], buf); merge(l._odata[sm], buf);
}
} }
});
// });
_time_counter += usecond()- inner_time_counter; _time_counter += usecond()- inner_time_counter;
} };
void SeedUniqueString(const std::string &s){ void SeedUniqueString(const std::string &s){
std::vector<int> seeds; std::vector<int> seeds;
@@ -426,13 +417,12 @@ public:
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Everybody loops over global volume. // Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, { parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
// Where is it?
int rank; // Where is it?
int o_idx; int rank,o_idx,i_idx;
int i_idx; std::vector<int> gcoor;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor); _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
@@ -442,7 +432,8 @@ public:
_generators[l_idx] = master_engine; _generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
} }
});
}
#else #else
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient // Machine and thread decomposition dependent seeding is efficient
@@ -468,7 +459,7 @@ public:
seeders[t] = Reseed(master_engine); seeders[t] = Reseed(master_engine);
} }
thread_for( t, Nthread, { parallel_for(int t=0;t<Nthread;t++) {
// set up one per local site in threaded fashion // set up one per local site in threaded fashion
std::vector<uint32_t> newseeds; std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid; std::uniform_int_distribution<uint32_t> uid;
@@ -477,7 +468,7 @@ public:
_generators[l] = Reseed(seeders[t],newseeds,uid); _generators[l] = Reseed(seeders[t],newseeds,uid);
} }
} }
}); }
} }
#endif #endif
} }
@@ -495,8 +486,8 @@ public:
uint32_t the_number; uint32_t the_number;
// who // who
std::vector<int> gcoor;
int rank,o_idx,i_idx; int rank,o_idx,i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor); _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
@@ -521,5 +512,5 @@ template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fil
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -32,20 +32,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// Tracing, transposing, peeking, poking // Tracing, transposing, peeking, poking
/////////////////////////////////////////////// ///////////////////////////////////////////////
NAMESPACE_BEGIN(Grid); namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace // Trace
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))> inline auto trace(const Lattice<vobj> &lhs)
-> Lattice<decltype(trace(lhs._odata[0]))>
{ {
Lattice<decltype(trace(vobj()))> ret(lhs.Grid()); Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); ret._odata[ss] = trace(lhs._odata[ss]);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { }
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
});
return ret; return ret;
}; };
@@ -53,17 +52,16 @@ inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
// Trace Index level dependent operation // Trace Index level dependent operation
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))>
{ {
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { }
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
});
return ret; return ret;
}; };
NAMESPACE_END(Grid);
}
#endif #endif

File diff suppressed because it is too large Load Diff

View File

@@ -33,19 +33,17 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// Transpose // Transpose
/////////////////////////////////////////////// ///////////////////////////////////////////////
NAMESPACE_BEGIN(Grid); namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose // Transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs._grid);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); ret._odata[ss] = transpose(lhs._odata[ss]);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ }
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
});
return ret; return ret;
}; };
@@ -53,16 +51,13 @@ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
// Index level dependent transpose // Index level dependent transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))> inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))>
{ {
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid);
auto ret_v = ret.View(); parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
auto lhs_v = lhs.View(); ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ }
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
});
return ret; return ret;
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@@ -31,50 +31,54 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_UNARY_H #ifndef GRID_LATTICE_UNARY_H
#define GRID_LATTICE_UNARY_H #define GRID_LATTICE_UNARY_H
NAMESPACE_BEGIN(Grid); namespace Grid {
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret(rhs._grid);
auto rhs = rhs_i.View(); ret.checkerboard = rhs.checkerboard;
auto ret = ret_i.View(); conformable(ret,rhs);
ret.Checkerboard() = rhs.Checkerboard(); parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
accelerator_for(ss,rhs.size(),1,{ ret._odata[ss]=pow(rhs._odata[ss],y);
ret[ss]=pow(rhs[ss],y);
});
return ret_i;
} }
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){ return ret;
Lattice<obj> ret_i(rhs_i.Grid()); }
auto rhs = rhs_i.View(); template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs,Integer y){
auto ret = ret_i.View(); Lattice<obj> ret(rhs._grid);
ret.Checkerboard() = rhs.Checkerboard(); ret.checkerboard = rhs.checkerboard;
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ conformable(ret,rhs);
coalescedWrite(ret[ss],mod(rhs(ss),y)); parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
}); ret._odata[ss]=mod(rhs._odata[ss],y);
return ret_i; }
return ret;
} }
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret(rhs._grid);
auto ret = ret_i.View(); ret.checkerboard = rhs.checkerboard;
auto rhs = rhs_i.View(); conformable(ret,rhs);
ret.Checkerboard() = rhs_i.Checkerboard(); parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ ret._odata[ss]=div(rhs._odata[ss],y);
coalescedWrite(ret[ss],div(rhs(ss),y)); }
}); return ret;
return ret_i;
} }
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret(rhs._grid);
auto rhs = rhs_i.View(); ret.checkerboard = rhs.checkerboard;
auto ret = ret_i.View(); conformable(ret,rhs);
ret.Checkerboard() = rhs.Checkerboard(); parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp);
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
});
return ret_i;
} }
NAMESPACE_END(Grid); return ret;
}
}
#endif #endif

View File

@@ -35,7 +35,7 @@ directory
#include <cxxabi.h> #include <cxxabi.h>
#include <memory> #include <memory>
NAMESPACE_BEGIN(Grid); namespace Grid {
std::string demangle(const char* name) { std::string demangle(const char* name) {
@@ -59,7 +59,6 @@ void GridLogTimestamp(int on){
} }
Colours GridLogColours(0); Colours GridLogColours(0);
GridLogger GridLogMG (1, "MG" , GridLogColours, "NORMAL");
GridLogger GridLogIRL (1, "IRL" , GridLogColours, "NORMAL"); GridLogger GridLogIRL (1, "IRL" , GridLogColours, "NORMAL");
GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL"); GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED"); GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
@@ -77,7 +76,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogIterative.Active(0); GridLogIterative.Active(0);
GridLogDebug.Active(0); GridLogDebug.Active(0);
GridLogPerformance.Active(0); GridLogPerformance.Active(0);
GridLogIntegrator.Active(1); GridLogIntegrator.Active(0);
GridLogColours.Active(0); GridLogColours.Active(0);
for (int i = 0; i < logstreams.size(); i++) { for (int i = 0; i < logstreams.size(); i++) {
@@ -86,7 +85,8 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance"))
GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1); if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
} }
@@ -109,9 +109,8 @@ void Grid_quiesce_nodes(void) {
} }
void Grid_unquiesce_nodes(void) { void Grid_unquiesce_nodes(void) {
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) #ifdef GRID_COMMS_MPI
std::cout.clear(); std::cout.clear();
#endif #endif
} }
NAMESPACE_END(Grid); }

View File

@@ -37,12 +37,13 @@
#include <execinfo.h> #include <execinfo.h>
#endif #endif
NAMESPACE_BEGIN(Grid); namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// Dress the output; use std::chrono for time stamping via the StopWatch class // Dress the output; use std::chrono for time stamping via the StopWatch class
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
class Colours{ class Colours{
protected: protected:
bool is_active; bool is_active;
@@ -145,11 +146,9 @@ public:
if ( log.timestamp ) { if ( log.timestamp ) {
log.StopWatch->Stop(); log.StopWatch->Stop();
GridTime now = log.StopWatch->Elapsed(); GridTime now = log.StopWatch->Elapsed();
if ( log.timing_mode==1 ) log.StopWatch->Reset(); if ( log.timing_mode==1 ) log.StopWatch->Reset();
log.StopWatch->Start(); log.StopWatch->Start();
stream << log.evidence() stream << log.evidence()<< std::setw(6)<<now << log.background() << " : " ;
<< now << log.background() << " : " ;
} }
stream << log.colour(); stream << log.colour();
return stream; return stream;
@@ -168,7 +167,6 @@ public:
void GridLogConfigure(std::vector<std::string> &logstreams); void GridLogConfigure(std::vector<std::string> &logstreams);
extern GridLogger GridLogMG;
extern GridLogger GridLogIRL; extern GridLogger GridLogIRL;
extern GridLogger GridLogSolver; extern GridLogger GridLogSolver;
extern GridLogger GridLogError; extern GridLogger GridLogError;
@@ -213,6 +211,6 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACE() BACKTRACEFP(stdout) #define BACKTRACE() BACKTRACEFP(stdout)
NAMESPACE_END(Grid);
}
#endif #endif

View File

@@ -1,3 +0,0 @@
#include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1;

View File

@@ -26,7 +26,8 @@
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO #define USE_MPI_IO
@@ -41,7 +42,8 @@
#include <arpa/inet.h> #include <arpa/inet.h>
#include <algorithm> #include <algorithm>
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Byte reversal garbage // Byte reversal garbage
@@ -79,7 +81,6 @@ inline void removeWhitespace(std::string &key)
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO { class BinaryIO {
public: public:
static int latticeWriteMaxRetry;
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// more byte manipulation helpers // more byte manipulation helpers
@@ -89,7 +90,7 @@ class BinaryIO {
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *grid = lat.Grid(); GridBase *grid = lat._grid;
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
@@ -109,20 +110,21 @@ class BinaryIO {
lsites = 1; lsites = 1;
} }
thread_region PARALLEL_REGION
{ {
uint32_t nersc_csum_thr = 0; uint32_t nersc_csum_thr = 0;
thread_for_in_region( local_site, lsites, PARALLEL_FOR_LOOP_INTERN
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{ {
uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++) for (uint64_t j = 0; j < size32; j++)
{ {
nersc_csum_thr = nersc_csum_thr + site_buf[j]; nersc_csum_thr = nersc_csum_thr + site_buf[j];
} }
}); }
thread_critical PARALLEL_CRITICAL
{ {
nersc_csum += nersc_csum_thr; nersc_csum += nersc_csum_thr;
} }
@@ -131,25 +133,28 @@ class BinaryIO {
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{ {
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
int nd = grid->_ndimension; int nd = grid->_ndimension;
uint64_t lsites =grid->lSites(); uint64_t lsites =grid->lSites();
if (fbuf.size()==1) { if (fbuf.size()==1) {
lsites=1; lsites=1;
} }
Coordinate local_vol =grid->LocalDimensions(); std::vector<int> local_vol =grid->LocalDimensions();
Coordinate local_start =grid->LocalStarts(); std::vector<int> local_start =grid->LocalStarts();
Coordinate global_vol =grid->FullDimensions(); std::vector<int> global_vol =grid->FullDimensions();
thread_region PARALLEL_REGION
{ {
Coordinate coor(nd); std::vector<int> coor(nd);
uint32_t scidac_csuma_thr=0; uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0; uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0; uint32_t site_crc=0;
thread_for_in_region( local_site, lsites, PARALLEL_FOR_LOOP_INTERN
{ for(uint64_t local_site=0;local_site<lsites;local_site++){
uint32_t * site_buf = (uint32_t *)&fbuf[local_site]; uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
@@ -176,9 +181,9 @@ class BinaryIO {
// std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl; // std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29); scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31); scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
}); }
thread_critical PARALLEL_CRITICAL
{ {
scidac_csuma^= scidac_csuma_thr; scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr; scidac_csumb^= scidac_csumb_thr;
@@ -196,23 +201,23 @@ class BinaryIO {
{ {
uint32_t * f = (uint32_t *)file_object; uint32_t * f = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t); uint64_t count = bytes/sizeof(uint32_t);
thread_for( i, count, { parallel_for(uint64_t i=0;i<count;i++){
f[i] = ntohl(f[i]); f[i] = ntohl(f[i]);
}); }
} }
// LE must Swap and switch to host // LE must Swap and switch to host
static inline void le32toh_v(void *file_object,uint64_t bytes) static inline void le32toh_v(void *file_object,uint64_t bytes)
{ {
uint32_t *fp = (uint32_t *)file_object; uint32_t *fp = (uint32_t *)file_object;
uint32_t f;
uint64_t count = bytes/sizeof(uint32_t); uint64_t count = bytes/sizeof(uint32_t);
thread_for(i,count,{ parallel_for(uint64_t i=0;i<count;i++){
uint32_t f;
f = fp[i]; f = fp[i];
// got network order and the network to host // got network order and the network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f); fp[i] = ntohl(f);
}); }
} }
// BE is same as network // BE is same as network
@@ -220,18 +225,19 @@ class BinaryIO {
{ {
uint64_t * f = (uint64_t *)file_object; uint64_t * f = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t); uint64_t count = bytes/sizeof(uint64_t);
thread_for( i, count, { parallel_for(uint64_t i=0;i<count;i++){
f[i] = Grid_ntohll(f[i]); f[i] = Grid_ntohll(f[i]);
}); }
} }
// LE must swap and switch; // LE must swap and switch;
static inline void le64toh_v(void *file_object,uint64_t bytes) static inline void le64toh_v(void *file_object,uint64_t bytes)
{ {
uint64_t *fp = (uint64_t *)file_object; uint64_t *fp = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t);
thread_for( i, count, {
uint64_t f,g; uint64_t f,g;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
f = fp[i]; f = fp[i];
// got network order and the network to host // got network order and the network to host
g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
@@ -239,7 +245,7 @@ class BinaryIO {
f = f >> 32; f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g); fp[i] = Grid_ntohll(g);
}); }
} }
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Real action: // Real action:
@@ -275,13 +281,13 @@ class BinaryIO {
int nrank = grid->ProcessorCount(); int nrank = grid->ProcessorCount();
int myrank = grid->ThisRank(); int myrank = grid->ThisRank();
Coordinate psizes = grid->ProcessorGrid(); std::vector<int> psizes = grid->ProcessorGrid();
Coordinate pcoor = grid->ThisProcessorCoor(); std::vector<int> pcoor = grid->ThisProcessorCoor();
Coordinate gLattice= grid->GlobalDimensions(); std::vector<int> gLattice= grid->GlobalDimensions();
Coordinate lLattice= grid->LocalDimensions(); std::vector<int> lLattice= grid->LocalDimensions();
Coordinate lStart(ndim); std::vector<int> lStart(ndim);
Coordinate gStart(ndim); std::vector<int> gStart(ndim);
// Flatten the file // Flatten the file
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@@ -342,8 +348,7 @@ class BinaryIO {
int ieee32 = (format == std::string("IEEE32")); int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG")); int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64")); int ieee64 = (format == std::string("IEEE64"));
assert(ieee64||ieee32|ieee64big||ieee32big);
assert((ieee64+ieee32+ieee64big+ieee32big)==1);
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Do the I/O // Do the I/O
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
@@ -365,7 +370,7 @@ class BinaryIO {
#endif #endif
} else { } else {
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : " std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl; << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin; std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in); fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND) if (control & BINARYIO_MASTER_APPEND)
@@ -540,7 +545,7 @@ class BinaryIO {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu.Grid(); GridBase *grid = Umu._grid;
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
@@ -552,7 +557,7 @@ class BinaryIO {
GridStopWatch timer; GridStopWatch timer;
timer.Start(); timer.Start();
thread_for(x,lsites, { munge(iodata[x], scalardata[x]); }); parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
vectorizeFromLexOrdArray(scalardata,Umu); vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier(); grid->Barrier();
@@ -576,10 +581,8 @@ class BinaryIO {
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu.Grid(); GridBase *grid = Umu._grid;
uint64_t lsites = grid->lSites(), offsetCopy = offset; uint64_t lsites = grid->lSites();
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@@ -590,40 +593,13 @@ class BinaryIO {
GridStopWatch timer; timer.Start(); GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu); unvectorizeToLexOrdArray(scalardata,Umu);
thread_for(x, lsites, { munge(scalardata[x],iodata[x]); }); parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
grid->Barrier(); grid->Barrier();
timer.Stop(); timer.Stop();
while (attemptsLeft >= 0)
{
grid->Barrier();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
if (checkWrite)
{
std::vector<fobj> ckiodata(lsites);
uint32_t cknersc_csum, ckscidac_csuma, ckscidac_csumb;
uint64_t ckoffset = offsetCopy;
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
grid->Barrier();
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
cknersc_csum,ckscidac_csuma,ckscidac_csumb);
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
offset = offsetCopy;
thread_for(x,lsites, { munge(scalardata[x],iodata[x]); });
}
else
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
break;
}
}
attemptsLeft--;
}
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
} }
@@ -631,8 +607,8 @@ class BinaryIO {
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Read a RNG; use IOobject and lexico map to an array of state // Read a RNG; use IOobject and lexico map to an array of state
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static inline void readRNG(GridSerialRNG &serial_rng, static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel_rng, GridParallelRNG &parallel,
std::string file, std::string file,
uint64_t offset, uint64_t offset,
uint32_t &nersc_csum, uint32_t &nersc_csum,
@@ -646,7 +622,7 @@ class BinaryIO {
std::string format = "IEEE32BIG"; std::string format = "IEEE32BIG";
GridBase *grid = parallel_rng.Grid(); GridBase *grid = parallel._grid;
uint64_t gsites = grid->gSites(); uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@@ -663,11 +639,11 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
timer.Start(); timer.Start();
thread_for(lidx,lsites,{ parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel_rng.SetState(tmp,lidx); parallel.SetState(tmp,lidx);
}); }
timer.Stop(); timer.Stop();
iodata.resize(1); iodata.resize(1);
@@ -677,7 +653,7 @@ class BinaryIO {
{ {
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin()); std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
serial_rng.SetState(tmp,0); serial.SetState(tmp,0);
} }
nersc_csum = nersc_csum + nersc_csum_tmp; nersc_csum = nersc_csum + nersc_csum_tmp;
@@ -693,8 +669,8 @@ class BinaryIO {
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Write a RNG; lexico map to an array of state and use IOobject // Write a RNG; lexico map to an array of state and use IOobject
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static inline void writeRNG(GridSerialRNG &serial_rng, static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel_rng, GridParallelRNG &parallel,
std::string file, std::string file,
uint64_t offset, uint64_t offset,
uint32_t &nersc_csum, uint32_t &nersc_csum,
@@ -706,7 +682,7 @@ class BinaryIO {
const int RngStateCount = GridSerialRNG::RngStateCount; const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate; typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel_rng.Grid(); GridBase *grid = parallel._grid;
uint64_t gsites = grid->gSites(); uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@@ -721,11 +697,11 @@ class BinaryIO {
timer.Start(); timer.Start();
std::vector<RNGstate> iodata(lsites); std::vector<RNGstate> iodata(lsites);
thread_for(lidx,lsites,{ parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
parallel_rng.GetState(tmp,lidx); parallel.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}); }
timer.Stop(); timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
@@ -733,7 +709,7 @@ class BinaryIO {
iodata.resize(1); iodata.resize(1);
{ {
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
serial_rng.GetState(tmp,0); serial.GetState(tmp,0);
std::copy(tmp.begin(),tmp.end(),iodata[0].begin()); std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
} }
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
@@ -749,5 +725,5 @@ class BinaryIO {
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl; std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
} }
}; };
}
NAMESPACE_END(Grid); #endif

View File

@@ -24,7 +24,8 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_ILDG_IO_H
#define GRID_ILDG_IO_H
#ifdef HAVE_LIME #ifdef HAVE_LIME
#include <algorithm> #include <algorithm>
@@ -42,13 +43,8 @@ extern "C" {
#include "lime.h" #include "lime.h"
} }
NAMESPACE_BEGIN(Grid); namespace Grid {
namespace QCD {
#define GRID_FIELD_NORM "FieldNormMetaData"
#define GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) \
0.5*fabs(FieldNormMetaData_.norm2 - n2ck)/(FieldNormMetaData_.norm2 + n2ck)
#define GRID_FIELD_NORM_CHECK(FieldNormMetaData_, n2ck) \
assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
///////////////////////////////// /////////////////////////////////
// Encode word types as strings // Encode word types as strings
@@ -138,7 +134,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
///////////////////////////////////// /////////////////////////////////////
// Scidac Private File structure // Scidac Private File structure
///////////////////////////////////// /////////////////////////////////////
_scidacFile = scidacFile(field.Grid()); _scidacFile = scidacFile(field._grid);
///////////////////////////////////// /////////////////////////////////////
// Scidac Private Record structure // Scidac Private Record structure
@@ -209,7 +205,6 @@ class GridLimeReader : public BinaryIO {
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
scidacChecksum scidacChecksum_; scidacChecksum scidacChecksum_;
FieldNormMetaData FieldNormMetaData_;
uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint32_t nersc_csum,scidac_csuma,scidac_csumb;
std::string format = getFormatString<vobj>(); std::string format = getFormatString<vobj>();
@@ -225,10 +220,10 @@ class GridLimeReader : public BinaryIO {
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl; // std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field.Grid()->_gsites; uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl; // std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field.Grid()->_gsites<<std::endl; // std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl; // std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl; // std::cout << "R file size " <<file_bytes <<std::endl;
@@ -238,52 +233,20 @@ class GridLimeReader : public BinaryIO {
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl; // std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge; BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
///////////////////////////////////////////// /////////////////////////////////////////////
// Insist checksum is next record // Insist checksum is next record
///////////////////////////////////////////// /////////////////////////////////////////////
readScidacChecksum(scidacChecksum_,FieldNormMetaData_); readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
///////////////////////////////////////////// /////////////////////////////////////////////
// Verify checksums // Verify checksums
///////////////////////////////////////////// /////////////////////////////////////////////
if(FieldNormMetaData_.norm2 != 0.0){
RealD n2ck = norm2(field);
std::cout << GridLogMessage << "Field norm: metadata= " << FieldNormMetaData_.norm2
<< " / field= " << n2ck << " / rdiff= " << GRID_FIELD_NORM_CALC(FieldNormMetaData_,n2ck) << std::endl;
GRID_FIELD_NORM_CHECK(FieldNormMetaData_,n2ck);
}
assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1); assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
// find out if next field is a GridFieldNorm
return; return;
} }
} }
} }
void readScidacChecksum(scidacChecksum &scidacChecksum_,
FieldNormMetaData &FieldNormMetaData_)
{
FieldNormMetaData_.norm2 =0.0;
std::string scidac_str(SCIDAC_CHECKSUM);
std::string field_norm_str(GRID_FIELD_NORM);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
std::string xmlstring = std::string(&xmlc[0]);
XmlReader RD(xmlstring, true, "");
if ( !strncmp(limeReaderType(LimeR), field_norm_str.c_str(),strlen(field_norm_str.c_str()) ) ) {
// std::cout << "FieldNormMetaData "<<xmlstring<<std::endl;
read(RD,field_norm_str,FieldNormMetaData_);
}
if ( !strncmp(limeReaderType(LimeR), scidac_str.c_str(),strlen(scidac_str.c_str()) ) ) {
// std::cout << SCIDAC_CHECKSUM << " " <<xmlstring<<std::endl;
read(RD,std::string("scidacChecksum"),scidacChecksum_);
return;
}
}
assert(0);
}
//////////////////////////////////////////// ////////////////////////////////////////////
// Read a generic serialisable object // Read a generic serialisable object
//////////////////////////////////////////// ////////////////////////////////////////////
@@ -404,7 +367,7 @@ class GridLimeWriter : public BinaryIO
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Write a generic lattice field and csum // Write a generic lattice field and csum
// This routine is Collectively called by all nodes // This routine is Collectively called by all nodes
// in communicator used by the field.Grid() // in communicator used by the field._grid
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template<class vobj> template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name) void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
@@ -423,10 +386,8 @@ class GridLimeWriter : public BinaryIO
// v) Continue writing scidac record. // v) Continue writing scidac record.
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
GridBase *grid = field.Grid(); GridBase *grid = field._grid;
assert(boss_node == field.Grid()->IsBoss() ); assert(boss_node == field._grid->IsBoss() );
FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
//////////////////////////////////////////// ////////////////////////////////////////////
// Create record header // Create record header
@@ -441,7 +402,7 @@ class GridLimeWriter : public BinaryIO
} }
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl; // std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field.Grid()->_gsites<<std::endl; // std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl; // std::cout << "W Payload expected " <<PayloadSize<<std::endl;
//////////////////////////////////////////////// ////////////////////////////////////////////////
@@ -486,7 +447,6 @@ class GridLimeWriter : public BinaryIO
checksum.suma= streama.str(); checksum.suma= streama.str();
checksum.sumb= streamb.str(); checksum.sumb= streamb.str();
if ( boss_node ) { if ( boss_node ) {
writeLimeObject(0,0,FNMD,std::string(GRID_FIELD_NORM),std::string(GRID_FIELD_NORM));
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM)); writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
} }
} }
@@ -513,7 +473,7 @@ class ScidacWriter : public GridLimeWriter {
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord, void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0) const unsigned int recordScientificPrec = 0)
{ {
GridBase * grid = field.Grid(); GridBase * grid = field._grid;
//////////////////////////////////////// ////////////////////////////////////////
// fill the Grid header // fill the Grid header
@@ -555,7 +515,7 @@ class ScidacReader : public GridLimeReader {
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase * grid = field.Grid(); GridBase * grid = field._grid;
//////////////////////////////////////// ////////////////////////////////////////
// fill the Grid header // fill the Grid header
@@ -622,7 +582,7 @@ class IldgWriter : public ScidacWriter {
template <class vsimd> template <class vsimd>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
{ {
GridBase * grid = Umu.Grid(); GridBase * grid = Umu._grid;
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj; typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@@ -664,12 +624,6 @@ class IldgWriter : public ScidacWriter {
assert(header.nd==4); assert(header.nd==4);
assert(header.nd==header.dimension.size()); assert(header.nd==header.dimension.size());
//////////////////////////////////////////////////////////////////////////////
// Field norm tests
//////////////////////////////////////////////////////////////////////////////
FieldNormMetaData FieldNormMetaData_;
FieldNormMetaData_.norm2 = norm2(Umu);
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Fill the USQCD info field // Fill the USQCD info field
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
@@ -678,12 +632,11 @@ class IldgWriter : public ScidacWriter {
info.plaq = header.plaquette; info.plaq = header.plaquette;
info.linktr = header.link_trace; info.linktr = header.link_trace;
// std::cout << GridLogMessage << " Writing config; IldgIO n2 "<< FieldNormMetaData_.norm2<<std::endl; std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
////////////////////////////////////////////// //////////////////////////////////////////////
// Fill the Lime file record by record // Fill the Lime file record by record
////////////////////////////////////////////// //////////////////////////////////////////////
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,FieldNormMetaData_,FieldNormMetaData_.SerialisableClassName(),std::string(GRID_FIELD_NORM));
writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
@@ -715,9 +668,9 @@ class IldgReader : public GridLimeReader {
typedef LorentzColourMatrixF fobj; typedef LorentzColourMatrixF fobj;
typedef LorentzColourMatrixD dobj; typedef LorentzColourMatrixD dobj;
GridBase *grid = Umu.Grid(); GridBase *grid = Umu._grid;
Coordinate dims = Umu.Grid()->FullDimensions(); std::vector<int> dims = Umu._grid->FullDimensions();
assert(dims.size()==4); assert(dims.size()==4);
@@ -726,7 +679,6 @@ class IldgReader : public GridLimeReader {
std::string ildgLFN_ ; std::string ildgLFN_ ;
scidacChecksum scidacChecksum_; scidacChecksum scidacChecksum_;
usqcdInfo usqcdInfo_ ; usqcdInfo usqcdInfo_ ;
FieldNormMetaData FieldNormMetaData_;
// track what we read from file // track what we read from file
int found_ildgFormat =0; int found_ildgFormat =0;
@@ -735,7 +687,7 @@ class IldgReader : public GridLimeReader {
int found_usqcdInfo =0; int found_usqcdInfo =0;
int found_ildgBinary =0; int found_ildgBinary =0;
int found_FieldMetaData =0; int found_FieldMetaData =0;
int found_FieldNormMetaData =0;
uint32_t nersc_csum; uint32_t nersc_csum;
uint32_t scidac_csuma; uint32_t scidac_csuma;
uint32_t scidac_csumb; uint32_t scidac_csumb;
@@ -822,17 +774,11 @@ class IldgReader : public GridLimeReader {
found_scidacChecksum = 1; found_scidacChecksum = 1;
} }
if ( !strncmp(limeReaderType(LimeR), GRID_FIELD_NORM,strlen(GRID_FIELD_NORM)) ) {
XmlReader RD(xmlstring, true, "");
read(RD,GRID_FIELD_NORM,FieldNormMetaData_);
found_FieldNormMetaData = 1;
}
} else { } else {
///////////////////////////////// /////////////////////////////////
// Binary data // Binary data
///////////////////////////////// /////////////////////////////////
// std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl; std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl;
uint64_t offset= ftello(File); uint64_t offset= ftello(File);
if ( format == std::string("IEEE64BIG") ) { if ( format == std::string("IEEE64BIG") ) {
GaugeSimpleMunger<dobj, sobj> munge; GaugeSimpleMunger<dobj, sobj> munge;
@@ -851,7 +797,6 @@ class IldgReader : public GridLimeReader {
// Minimally must find binary segment and checksum // Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format // Since this is an ILDG reader require ILDG format
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
assert(found_ildgLFN);
assert(found_ildgBinary); assert(found_ildgBinary);
assert(found_ildgFormat); assert(found_ildgFormat);
assert(found_scidacChecksum); assert(found_scidacChecksum);
@@ -900,13 +845,6 @@ class IldgReader : public GridLimeReader {
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Really really want to mandate a scidac checksum // Really really want to mandate a scidac checksum
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
if ( found_FieldNormMetaData ) {
RealD nn = norm2(Umu);
GRID_FIELD_NORM_CHECK(FieldNormMetaData_,nn);
std::cout << GridLogMessage<<"FieldNormMetaData matches " << std::endl;
} else {
std::cout << GridLogWarning<<"FieldNormMetaData not found. " << std::endl;
}
if ( found_scidacChecksum ) { if ( found_scidacChecksum ) {
FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16); FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16); FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
@@ -929,9 +867,9 @@ class IldgReader : public GridLimeReader {
} }
}; };
NAMESPACE_END(Grid); }}
//HAVE_LIME //HAVE_LIME
#endif #endif
#endif

View File

@@ -32,7 +32,7 @@ extern "C" { // for linkage
#include "lime.h" #include "lime.h"
} }
NAMESPACE_BEGIN(Grid); namespace Grid {
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Data representation of records that enter ILDG and SciDac formats // Data representation of records that enter ILDG and SciDac formats
@@ -91,7 +91,7 @@ public:
return dimensions; return dimensions;
} }
void setDimensions(Coordinate dimensions) { void setDimensions(std::vector<int> dimensions) {
char delimiter = ' '; char delimiter = ' ';
std::stringstream stream; std::stringstream stream;
for(int i=0;i<dimensions.size();i++){ for(int i=0;i<dimensions.size();i++){
@@ -232,6 +232,6 @@ public:
}; };
#endif #endif
NAMESPACE_END(Grid); }
#endif #endif
#endif #endif

View File

@@ -36,7 +36,7 @@
#include <sys/utsname.h> #include <sys/utsname.h>
#include <pwd.h> #include <pwd.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// Precision mapping // Precision mapping
@@ -52,15 +52,10 @@ template<class vobj> static std::string getFormatString (void)
format = std::string("IEEE64BIG"); format = std::string("IEEE64BIG");
} }
return format; return format;
}; }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// header specification/interpretation // header specification/interpretation
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
class FieldNormMetaData : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FieldNormMetaData, double, norm2);
};
class FieldMetaData : Serializable { class FieldMetaData : Serializable {
public: public:
@@ -96,9 +91,10 @@ template<class vobj> static std::string getFormatString (void)
{} {}
}; };
// PB disable using namespace - this is a header and forces namesapce visibility for all namespace QCD {
// including files
//using namespace Grid; using namespace Grid;
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data // Bit and Physical Checksumming and QA of data
@@ -169,7 +165,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header) template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
{ {
GridBase *grid = field.Grid(); GridBase *grid = field._grid;
std::string format = getFormatString<vobj>(); std::string format = getFormatString<vobj>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
@@ -179,19 +175,19 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header) inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{ {
// How to convert data precision etc... // How to convert data precision etc...
header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data); header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplF>::linkTrace(data);
header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data); header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
} }
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header) inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{ {
// How to convert data precision etc... // How to convert data precision etc...
header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data); header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplD>::linkTrace(data);
header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data); header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
} }
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{ {
GridBase *grid = field.Grid(); GridBase *grid = field._grid;
std::string format = getFormatString<vLorentzColourMatrixF>(); std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
@@ -201,7 +197,7 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzCo
} }
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{ {
GridBase *grid = field.Grid(); GridBase *grid = field._grid;
std::string format = getFormatString<vLorentzColourMatrixD>(); std::string format = getFormatString<vLorentzColourMatrixD>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
@@ -325,6 +321,7 @@ struct Gauge3x2unmunger{
} }
} }
}; };
}
NAMESPACE_END(Grid);
}

View File

@@ -30,7 +30,8 @@
#ifndef GRID_NERSC_IO_H #ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H #define GRID_NERSC_IO_H
NAMESPACE_BEGIN(Grid); namespace Grid {
namespace QCD {
using namespace Grid; using namespace Grid;
@@ -56,6 +57,7 @@ public:
// for the header-reader // for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field) static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{ {
uint64_t offset=0;
std::map<std::string,std::string> header; std::map<std::string,std::string> header;
std::string line; std::string line;
@@ -136,8 +138,8 @@ public:
{ {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu.Grid(); GridBase *grid = Umu._grid;
uint64_t offset = readHeader(file,Umu.Grid(),header); uint64_t offset = readHeader(file,Umu._grid,header);
FieldMetaData clone(header); FieldMetaData clone(header);
@@ -188,6 +190,8 @@ public:
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) { if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
std::cout << " Plaquette mismatch "<<std::endl; std::cout << " Plaquette mismatch "<<std::endl;
std::cout << Umu[0]<<std::endl;
std::cout << Umu[1]<<std::endl;
} }
if ( nersc_csum != header.checksum ) { if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl; std::cerr << " checksum mismatch " << std::endl;
@@ -225,7 +229,7 @@ public:
typedef LorentzColourMatrixD fobj3D; typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D; typedef LorentzColour2x3D fobj2D;
GridBase *grid = Umu.Grid(); GridBase *grid = Umu._grid;
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
@@ -270,7 +274,7 @@ public:
header.ensemble_id = "UKQCD"; header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF"; header.ensemble_label = "DWF";
GridBase *grid = parallel.Grid(); GridBase *grid = parallel._grid;
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
@@ -317,7 +321,7 @@ public:
{ {
typedef typename GridParallelRNG::RngStateType RngStateType; typedef typename GridParallelRNG::RngStateType RngStateType;
GridBase *grid = parallel.Grid(); GridBase *grid = parallel._grid;
uint64_t offset = readHeader(file,grid,header); uint64_t offset = readHeader(file,grid,header);
@@ -352,8 +356,8 @@ public:
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl; std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
} }
}; };
NAMESPACE_END(Grid); }}
#endif #endif

View File

@@ -29,7 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B) #define RawConfig(A,B) (A<<8|B)
@@ -72,5 +72,4 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" }, // { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif #endif
}; };
NAMESPACE_END(Grid); }

View File

@@ -44,15 +44,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#ifdef __x86_64__ #ifdef __x86_64__
#ifdef GRID_NVCC
accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else
#include <x86intrin.h> #include <x86intrin.h>
#endif #endif
#endif
NAMESPACE_BEGIN(Grid); namespace Grid {
#ifdef __linux__ #ifdef __linux__
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
@@ -96,6 +91,8 @@ inline uint64_t cyclecount(void){
#elif defined __x86_64__ #elif defined __x86_64__
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return __rdtsc(); return __rdtsc();
// unsigned int dummy;
// return __rdtscp(&dummy);
} }
#else #else
@@ -215,7 +212,7 @@ public:
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
ign=::read(fd, &count, sizeof(long long)); ign=::read(fd, &count, sizeof(long long));
ign+=::read(cyclefd, &cycles, sizeof(long long)); ign+=::read(cyclefd, &cycles, sizeof(long long));
assert(ign==2*sizeof(long long)); assert(ign=2*sizeof(long long));
} }
elapsed = cyclecount() - begin; elapsed = cyclecount() - begin;
#else #else
@@ -244,6 +241,5 @@ public:
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -2,7 +2,7 @@
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Stat.h> #include <Grid/perfmon/Stat.h>
NAMESPACE_BEGIN(Grid); namespace Grid {
bool PmuStat::pmu_initialized=false; bool PmuStat::pmu_initialized=false;
@@ -242,5 +242,4 @@ void PmuStat::KNLreadctrs(ctrs &c)
} }
#endif #endif
NAMESPACE_END(Grid); }

View File

@@ -5,7 +5,7 @@
#define _KNIGHTS_LANDING_ROOTONLY #define _KNIGHTS_LANDING_ROOTONLY
#endif #endif
NAMESPACE_BEGIN(Grid); namespace Grid {
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Extra KNL counters from MCDRAM // Extra KNL counters from MCDRAM
@@ -98,8 +98,7 @@ public:
}; };
NAMESPACE_END(Grid); }
#endif #endif

View File

@@ -33,9 +33,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <ctime> #include <ctime>
#include <chrono> #include <chrono>
NAMESPACE_BEGIN(Grid) namespace Grid {
// Dress the output; use std::chrono // Dress the output; use std::chrono
// C++11 time facilities better? // C++11 time facilities better?
inline double usecond(void) { inline double usecond(void) {
struct timeval tv; struct timeval tv;
@@ -47,38 +49,20 @@ inline double usecond(void) {
typedef std::chrono::system_clock GridClock; typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint; typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::seconds GridSecs;
typedef std::chrono::milliseconds GridMillisecs; typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridUsecs;
typedef std::chrono::microseconds GridTime; typedef std::chrono::microseconds GridTime;
typedef std::chrono::microseconds GridUsecs;
inline std::ostream& operator<< (std::ostream & stream, const GridSecs & time) inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
{ {
stream << time.count()<<" s"; stream << time.count()<<" ms";
return stream; return stream;
} }
inline std::ostream& operator<< (std::ostream & stream, const GridMillisecs & now) inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
{ {
GridSecs second(1); stream << time.count()<<" usec";
auto secs = now/second ;
auto subseconds = now%second ;
auto fill = stream.fill();
stream << secs<<"."<<std::setw(3)<<std::setfill('0')<<subseconds.count()<<" s";
stream.fill(fill);
return stream; return stream;
} }
inline std::ostream& operator<< (std::ostream & stream, const GridUsecs & now)
{
GridSecs second(1);
auto seconds = now/second ;
auto subseconds = now%second ;
auto fill = stream.fill();
stream << seconds<<"."<<std::setw(6)<<std::setfill('0')<<subseconds.count()<<" s";
stream.fill(fill);
return stream;
}
class GridStopWatch { class GridStopWatch {
private: private:
@@ -123,6 +107,5 @@ public:
} }
}; };
NAMESPACE_END(Grid) }
#endif #endif

View File

@@ -14,12 +14,7 @@
#ifndef SOURCE_PUGIXML_CPP #ifndef SOURCE_PUGIXML_CPP
#define SOURCE_PUGIXML_CPP #define SOURCE_PUGIXML_CPP
#ifdef __NVCC__ #include <Grid/pugixml/pugixml.h>
#pragma push
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#endif
#include "pugixml.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
@@ -207,7 +202,7 @@ PUGI__NS_BEGIN
// Without a template<> we'll get multiple definitions of the same static // Without a template<> we'll get multiple definitions of the same static
template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate; template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate; template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
template struct xml_memory_management_function_storage<int>;
typedef xml_memory_management_function_storage<int> xml_memory; typedef xml_memory_management_function_storage<int> xml_memory;
PUGI__NS_END PUGI__NS_END
@@ -12773,10 +12768,6 @@ namespace pugi
#undef PUGI__THROW_ERROR #undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR #undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC
#pragma pop
#endif
#endif #endif
/** /**

View File

@@ -29,53 +29,55 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #ifndef GRID_QCD_BASE_H
#define GRID_QCD_BASE_H
namespace Grid{
namespace QCD {
NAMESPACE_BEGIN(Grid); static const int Xdir = 0;
static const int Ydir = 1;
static const int Zdir = 2;
static const int Tdir = 3;
static constexpr int Xdir = 0;
static constexpr int Ydir = 1;
static constexpr int Zdir = 2;
static constexpr int Tdir = 3;
static constexpr int Xp = 0; static const int Xp = 0;
static constexpr int Yp = 1; static const int Yp = 1;
static constexpr int Zp = 2; static const int Zp = 2;
static constexpr int Tp = 3; static const int Tp = 3;
static constexpr int Xm = 4; static const int Xm = 4;
static constexpr int Ym = 5; static const int Ym = 5;
static constexpr int Zm = 6; static const int Zm = 6;
static constexpr int Tm = 7; static const int Tm = 7;
static constexpr int Nc=3; static const int Nc=3;
static constexpr int Ns=4; static const int Ns=4;
static constexpr int Nd=4; static const int Nd=4;
static constexpr int Nhs=2; // half spinor static const int Nhs=2; // half spinor
static constexpr int Nds=8; // double stored gauge field static const int Nds=8; // double stored gauge field
static constexpr int Ngp=2; // gparity index range static const int Ngp=2; // gparity index range
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types // QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour // Index conventions: Lorentz x Spin x Colour
// note: static constexpr int or constexpr will work for type deductions // note: static const int or constexpr will work for type deductions
// with the intel compiler (up to version 17) // with the intel compiler (up to version 17)
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
#define ColourIndex (2) #define ColourIndex 2
#define SpinIndex (1) #define SpinIndex 1
#define LorentzIndex (0) #define LorentzIndex 0
// Also should make these a named enum type // Also should make these a named enum type
static constexpr int DaggerNo=0; static const int DaggerNo=0;
static constexpr int DaggerYes=1; static const int DaggerYes=1;
static constexpr int InverseNo=0; static const int InverseNo=0;
static constexpr int InverseYes=1; static const int InverseYes=1;
// Useful traits is this a spin index // Useful traits is this a spin index
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
const int SpinorIndex = 2; const int SpinorIndex = 2;
template<typename T> struct isSpinor { template<typename T> struct isSpinor {
static constexpr bool value = (SpinorIndex==T::TensorLevel); static const bool value = (SpinorIndex==T::TensorLevel);
}; };
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ; template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ; template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
@@ -380,35 +382,35 @@ template<class vobj> auto peekLorentz(const Lattice<vobj> &rhs,int i) -> decltyp
////////////////////////////////////////////// //////////////////////////////////////////////
template<class vobj> template<class vobj>
void pokeColour(Lattice<vobj> &lhs, void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0))> & rhs, const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs,
int i) int i)
{ {
PokeIndex<ColourIndex>(lhs,rhs,i); PokeIndex<ColourIndex>(lhs,rhs,i);
} }
template<class vobj> template<class vobj>
void pokeColour(Lattice<vobj> &lhs, void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0,0))> & rhs, const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs,
int i,int j) int i,int j)
{ {
PokeIndex<ColourIndex>(lhs,rhs,i,j); PokeIndex<ColourIndex>(lhs,rhs,i,j);
} }
template<class vobj> template<class vobj>
void pokeSpin(Lattice<vobj> &lhs, void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0))> & rhs, const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs,
int i) int i)
{ {
PokeIndex<SpinIndex>(lhs,rhs,i); PokeIndex<SpinIndex>(lhs,rhs,i);
} }
template<class vobj> template<class vobj>
void pokeSpin(Lattice<vobj> &lhs, void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0,0))> & rhs, const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs,
int i,int j) int i,int j)
{ {
PokeIndex<SpinIndex>(lhs,rhs,i,j); PokeIndex<SpinIndex>(lhs,rhs,i,j);
} }
template<class vobj> template<class vobj>
void pokeLorentz(Lattice<vobj> &lhs, void pokeLorentz(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<LorentzIndex>(vobj(),0))> & rhs, const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs,
int i) int i)
{ {
PokeIndex<LorentzIndex>(lhs,rhs,i); PokeIndex<LorentzIndex>(lhs,rhs,i);
@@ -497,12 +499,12 @@ template<int Index,class vobj> inline vobj transposeColour(const vobj &lhs){
// Trace lattice and non-lattice // Trace lattice and non-lattice
////////////////////////////////////////// //////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))> inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs._odata[0]))>
{ {
return traceIndex<SpinIndex>(lhs); return traceIndex<SpinIndex>(lhs);
} }
template<int Index,class vobj> template<int Index,class vobj>
inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))> inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs._odata[0]))>
{ {
return traceIndex<ColourIndex>(lhs); return traceIndex<ColourIndex>(lhs);
} }
@@ -525,5 +527,9 @@ GRID_SERIALIZABLE_ENUM(Current, undef,
Axial, 1, Axial, 1,
Tadpole, 2); Tadpole, 2);
NAMESPACE_END(Grid); } //namespace QCD
} // Grid
#endif

Some files were not shown because too many files have changed in this diff Show More