1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-18 09:45:55 +01:00

Merge branch 'develop' of github.com:fionnoh/Grid into feature/A2A_current_insertion

Peter's GPU branch changes merged with A2A CI code
This commit is contained in:
Fionn O hOgain 2019-09-30 16:53:44 +01:00
commit d1daab601a
785 changed files with 41312 additions and 51680 deletions

View File

@ -30,8 +30,34 @@ directory
#ifndef DISABLE_WARNINGS_H #ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H #define DISABLE_WARNINGS_H
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
//disables an Intel compiler specific warning (in json.hpp) //disables an Intel compiler specific warning (in json.hpp)
#pragma warning disable 488 #pragma warning disable 488
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC
#ifdef __ALTIVEC__
#define EIGEN_DONT_VECTORIZE
#endif
#ifdef __VSX__
#define EIGEN_DONT_VECTORIZE
#endif
#endif #endif

View File

@ -38,16 +38,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_BASE_H #ifndef GRID_BASE_H
#define GRID_BASE_H #define GRID_BASE_H
#include <Grid/GridStd.h>
#include <Grid/DisableWarnings.h>
#include <Grid/Namespace.h>
#include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h> #include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h> #include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/threads/Threads.h> #include <Grid/threads/Threads.h>
#include <Grid/util/Util.h> #include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h> #include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h> #include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h> #include <Grid/cartesian/Cartesian.h>
@ -57,5 +60,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/stencil/Stencil.h> #include <Grid/stencil/Stencil.h>
#include <Grid/parallelIO/BinaryIO.h> #include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h> #include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore)
#endif #endif

View File

@ -38,5 +38,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/qcd/spin/Spin.h> #include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/utils/Utils.h> #include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h> #include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);
#endif #endif

View File

@ -7,6 +7,7 @@
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <vector> #include <vector>
#include <array>
#include <string> #include <string>
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>

View File

@ -1,14 +1,41 @@
#include <Grid/GridCore.h>
#pragma once #pragma once
// Force Eigen to use MKL if Grid has been configured with --enable-mkl // Force Eigen to use MKL if Grid has been configured with --enable-mkl
#ifdef USE_MKL #ifdef USE_MKL
#define EIGEN_USE_MKL_ALL #define EIGEN_USE_MKL_ALL
#endif #endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif #endif
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#pragma diag_suppress code_is_unreachable
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__
#endif
#include <Grid/Eigen/Dense> #include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
/* NVCC restore */
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop
#endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif

1
Grid/Grid_Eigen_Tensor.h Normal file
View File

@ -0,0 +1 @@
#include <Grid/Grid_Eigen_Dense.h>

38
Grid/Namespace.h Normal file
View File

@ -0,0 +1,38 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Namespace.h
Copyright (C) 2016
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <type_traits>
#include <cassert>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );

View File

@ -32,7 +32,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H
namespace Grid { NAMESPACE_BEGIN(Grid);
class Geometry { class Geometry {
// int dimension; // int dimension;
@ -104,7 +104,7 @@ namespace Grid {
GridBase *FineGrid; GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace; std::vector<Lattice<Fobj> > subspace;
int checkerboard; int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid), CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid), FineGrid(_FineGrid),
@ -127,10 +127,10 @@ namespace Grid {
CoarseVector eProj(CoarseGrid); CoarseVector eProj(CoarseGrid);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace); blockProject(iProj,subspace[i],subspace);
eProj=zero; eProj=Zero();
parallel_for(int ss=0;ss<CoarseGrid->oSites();ss++){ thread_for(ss, CoarseGrid->oSites(),{
eProj._odata[ss](i)=CComplex(1.0); eProj[ss](i)=CComplex(1.0);
} });
eProj=eProj - iProj; eProj=eProj - iProj;
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl; std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
} }
@ -140,7 +140,7 @@ namespace Grid {
blockProject(CoarseVec,FineVec,subspace); blockProject(CoarseVec,FineVec,subspace);
} }
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){ void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.checkerboard = subspace[0].checkerboard; FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace); blockPromote(CoarseVec,FineVec,subspace);
} }
void CreateSubspaceRandom(GridParallelRNG &RNG){ void CreateSubspaceRandom(GridParallelRNG &RNG){
@ -211,7 +211,7 @@ namespace Grid {
for(int b=0;b<nn;b++){ for(int b=0;b<nn;b++){
subspace[b] = zero; subspace[b] = Zero();
gaussian(RNG,noise); gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5); scale = std::pow(norm2(noise),-0.5);
noise=noise*scale; noise=noise*scale;
@ -255,7 +255,8 @@ namespace Grid {
//////////////////// ////////////////////
Geometry geom; Geometry geom;
GridBase * _grid; GridBase * _grid;
CartesianStencil<siteVector,siteVector> Stencil;
CartesianStencil<siteVector,siteVector,int> Stencil;
std::vector<CoarseMatrix> A; std::vector<CoarseMatrix> A;
@ -267,14 +268,15 @@ namespace Grid {
RealD M (const CoarseVector &in, CoarseVector &out){ RealD M (const CoarseVector &in, CoarseVector &out){
conformable(_grid,in._grid); conformable(_grid,in.Grid());
conformable(in._grid,out._grid); conformable(in.Grid(),out.Grid());
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
auto in_v = in.View();
// out_v must view the destination field: each site's result is streamed into out_v[ss] and norm2(out) is returned.
parallel_for(int ss=0;ss<Grid()->oSites();ss++){ auto out_v = out.View();
siteVector res = zero; thread_for(ss,Grid()->oSites(),{
siteVector res = Zero();
siteVector nbr; siteVector nbr;
int ptype; int ptype;
StencilEntry *SE; StencilEntry *SE;
@ -283,16 +285,17 @@ namespace Grid {
SE=Stencil.GetEntry(ptype,point,ss); SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) { if(SE->_is_local&&SE->_permute) {
permute(nbr,in._odata[SE->_offset],ptype); permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) { } else if(SE->_is_local) {
nbr = in._odata[SE->_offset]; nbr = in_v[SE->_offset];
} else { } else {
nbr = Stencil.CommBuf()[SE->_offset]; nbr = Stencil.CommBuf()[SE->_offset];
} }
res = res + A[point]._odata[ss]*nbr; auto A_point = A[point].View();
} res = res + A_point[ss]*nbr;
vstream(out._odata[ss],res);
} }
vstream(out_v[ss],res);
});
return norm2(out); return norm2(out);
}; };
@ -310,8 +313,8 @@ namespace Grid {
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){ void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
conformable(_grid,in._grid); conformable(_grid,in.Grid());
conformable(in._grid,out._grid); conformable(in.Grid(),out.Grid());
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
@ -323,8 +326,10 @@ namespace Grid {
return (4 * dir + 1 - disp) / 2; return (4 * dir + 1 - disp) / 2;
}(); }();
parallel_for(int ss=0;ss<Grid()->oSites();ss++){ auto out_v = out.View();
siteVector res = zero; auto in_v = in.View();
thread_for(ss,Grid()->oSites(),{
siteVector res = Zero();
siteVector nbr; siteVector nbr;
int ptype; int ptype;
StencilEntry *SE; StencilEntry *SE;
@ -332,28 +337,30 @@ namespace Grid {
SE=Stencil.GetEntry(ptype,point,ss); SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) { if(SE->_is_local&&SE->_permute) {
permute(nbr,in._odata[SE->_offset],ptype); permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) { } else if(SE->_is_local) {
nbr = in._odata[SE->_offset]; nbr = in_v[SE->_offset];
} else { } else {
nbr = Stencil.CommBuf()[SE->_offset]; nbr = Stencil.CommBuf()[SE->_offset];
} }
res = res + A[point]._odata[ss]*nbr; auto A_point = A[point].View();
res = res + A_point[ss]*nbr;
vstream(out._odata[ss],res); vstream(out_v[ss],res);
} });
}; };
void Mdiag(const CoarseVector &in, CoarseVector &out){ void Mdiag(const CoarseVector &in, CoarseVector &out){
Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
}; };
CoarsenedMatrix(GridCartesian &CoarseGrid) : CoarsenedMatrix(GridCartesian &CoarseGrid) :
_grid(&CoarseGrid), _grid(&CoarseGrid),
geom(CoarseGrid._ndimension), geom(CoarseGrid._ndimension),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid) A(geom.npoint,&CoarseGrid)
{ {
}; };
@ -366,7 +373,7 @@ namespace Grid {
FineField phi(FineGrid); FineField phi(FineGrid);
FineField tmp(FineGrid); FineField tmp(FineGrid);
FineField zz(FineGrid); zz=zero; FineField zz(FineGrid); zz=Zero();
FineField Mphi(FineGrid); FineField Mphi(FineGrid);
Lattice<iScalar<vInteger> > coor(FineGrid); Lattice<iScalar<vInteger> > coor(FineGrid);
@ -382,7 +389,7 @@ namespace Grid {
// set of vectors. // set of vectors.
int self_stencil=-1; int self_stencil=-1;
for(int p=0;p<geom.npoint;p++){ for(int p=0;p<geom.npoint;p++){
A[p]=zero; A[p]=Zero();
if( geom.displacements[p]==0){ if( geom.displacements[p]==0){
self_stencil=p; self_stencil=p;
} }
@ -415,7 +422,7 @@ namespace Grid {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
if ( disp==0 ) { if ( disp==0 ) {
iblock = Mphi; iblock = Mphi;
oblock = zero; oblock = Zero();
} else if ( disp==1 ) { } else if ( disp==1 ) {
oblock = where(mod(coor,block)==(block-1),Mphi,zz); oblock = where(mod(coor,block)==(block-1),Mphi,zz);
iblock = where(mod(coor,block)!=(block-1),Mphi,zz); iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
@ -430,14 +437,18 @@ namespace Grid {
Subspace.ProjectToSubspace(oProj,oblock); Subspace.ProjectToSubspace(oProj,oblock);
// blockProject(iProj,iblock,Subspace.subspace); // blockProject(iProj,iblock,Subspace.subspace);
// blockProject(oProj,oblock,Subspace.subspace); // blockProject(oProj,oblock,Subspace.subspace);
parallel_for(int ss=0;ss<Grid()->oSites();ss++){ auto iProj_v = iProj.View() ;
auto oProj_v = oProj.View() ;
auto A_p = A[p].View();
auto A_self = A[self_stencil].View();
thread_for(ss, Grid()->oSites(),{
for(int j=0;j<nbasis;j++){ for(int j=0;j<nbasis;j++){
if( disp!= 0 ) { if( disp!= 0 ) {
A[p]._odata[ss](j,i) = oProj._odata[ss](j); A_p[ss](j,i) = oProj_v[ss](j);
}
A[self_stencil]._odata[ss](j,i) = A[self_stencil]._odata[ss](j,i) + iProj._odata[ss](j);
} }
A_self[ss](j,i) = A_self[ss](j,i) + iProj_v[ss](j);
} }
});
} }
} }
@ -466,32 +477,7 @@ namespace Grid {
// AssertHermitian(); // AssertHermitian();
// ForceDiagonal(); // ForceDiagonal();
} }
void ForceDiagonal(void) {
std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
std::cout<<GridLogMessage<<"**** Forcing coarse operator to be diagonal ****"<<std::endl;
std::cout<<GridLogMessage<<"**************************************************"<<std::endl;
for(int p=0;p<8;p++){
A[p]=zero;
}
GridParallelRNG RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34}));
Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val);
Complex one(1.0);
iMatrix<CComplex,nbasis> ident; ident=one;
val = val*adj(val);
val = val + 1.0;
A[8] = val*ident;
// for(int s=0;s<Grid()->oSites();s++) {
// A[8]._odata[s]=val._odata[s];
// }
}
void ForceHermitian(void) { void ForceHermitian(void) {
for(int d=0;d<4;d++){ for(int d=0;d<4;d++){
int dd=d+1; int dd=d+1;
@ -522,5 +508,5 @@ namespace Grid {
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { }; template<class scalar> struct FFTW { };
@ -115,9 +115,9 @@ namespace Grid {
double flops_call; double flops_call;
uint64_t usec; uint64_t usec;
std::vector<int> dimensions; Coordinate dimensions;
std::vector<int> processors; Coordinate processors;
std::vector<int> processor_coor; Coordinate processor_coor;
public: public:
@ -137,7 +137,7 @@ namespace Grid {
{ {
flops=0; flops=0;
usec =0; usec =0;
std::vector<int> layout(Nd,1); Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors); sgrid = new GridCartesian(dimensions,layout,processors);
}; };
@ -146,10 +146,10 @@ namespace Grid {
} }
template<class vobj> template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){ void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
conformable(result._grid,vgrid); conformable(result.Grid(),vgrid);
conformable(source._grid,vgrid); conformable(source.Grid(),vgrid);
Lattice<vobj> tmp(vgrid); Lattice<vobj> tmp(vgrid);
tmp = source; tmp = source;
for(int d=0;d<Nd;d++){ for(int d=0;d<Nd;d++){
@ -162,7 +162,7 @@ namespace Grid {
template<class vobj> template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){ void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
std::vector<int> mask(Nd,1); Coordinate mask(Nd,1);
FFT_dim_mask(result,source,mask,sign); FFT_dim_mask(result,source,mask,sign);
} }
@ -172,14 +172,14 @@ namespace Grid {
#ifndef HAVE_FFTW #ifndef HAVE_FFTW
assert(0); assert(0);
#else #else
conformable(result._grid,vgrid); conformable(result.Grid(),vgrid);
conformable(source._grid,vgrid); conformable(source.Grid(),vgrid);
int L = vgrid->_ldimensions[dim]; int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim]; int G = vgrid->_fdimensions[dim];
std::vector<int> layout(Nd,1); Coordinate layout(Nd,1);
std::vector<int> pencil_gd(vgrid->_fdimensions); Coordinate pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim]; pencil_gd[dim] = G*processors[dim];
@ -191,7 +191,7 @@ namespace Grid {
typedef typename sobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View();
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -217,8 +217,8 @@ namespace Grid {
FFTW_plan p; FFTW_plan p;
{ {
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[0]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[0]; FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany, p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed, in,inembed,
istride,idist, istride,idist,
@ -228,26 +228,20 @@ namespace Grid {
} }
// Barrel shift and collect global pencil // Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd); Coordinate lcoor(Nd), gcoor(Nd);
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) { for(int p=0;p<processors[dim];p++) {
PARALLEL_REGION thread_for(idx, sgrid->lSites(),{
{ Coordinate cbuf(Nd);
std::vector<int> cbuf(Nd);
sobj s; sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf); sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf); peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L; cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L; // cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf); pokeLocalSite(s,pgbuf,cbuf);
} });
} if (p != processors[dim] - 1) {
if (p != processors[dim] - 1)
{
result = Cshift(result,dim,L); result = Cshift(result,dim,L);
} }
} }
@ -256,20 +250,15 @@ namespace Grid {
int NN=pencil_g.lSites(); int NN=pencil_g.lSites();
GridStopWatch timer; GridStopWatch timer;
timer.Start(); timer.Start();
PARALLEL_REGION thread_for( idx,NN,{
{ Coordinate cbuf(Nd);
std::vector<int> cbuf(Nd);
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<NN;idx++) {
pencil_g.LocalIndexToLocalCoor(idx, cbuf); pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0 if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf._odata[idx]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf._odata[idx]; FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out); FFTW<scalar>::fftw_execute_dft(p,in,out);
} }
} });
}
timer.Stop(); timer.Stop();
// performance counting // performance counting
@ -280,20 +269,15 @@ namespace Grid {
flops+= flops_call*NN; flops+= flops_call*NN;
// writing out result // writing out result
PARALLEL_REGION thread_for(idx,sgrid->lSites(),{
{ Coordinate clbuf(Nd), cgbuf(Nd);
std::vector<int> clbuf(Nd), cgbuf(Nd);
sobj s; sobj s;
PARALLEL_FOR_LOOP_INTERN
for(int idx=0;idx<sgrid->lSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,clbuf); sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf; cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc; cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf); peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf); pokeLocalSite(s,result,clbuf);
} });
}
result = result*div; result = result*div;
// destroying plan // destroying plan
@ -301,6 +285,7 @@ namespace Grid {
#endif #endif
} }
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -26,16 +26,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALGORITHM_LINEAR_OP_H #pragma once
#define GRID_ALGORITHM_LINEAR_OP_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// LinearOperators Take a something and return a something. // LinearOperators Take a something and return a something.
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// //
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real): // Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian conjugate (transpose if real):
//SBase //SBase
// i) F(a x + b y) = aF(x) + b F(y). // i) F(a x + b y) = aF(x) + b F(y).
// ii) <x|Op|y> = <y|AdjOp|x>^\ast // ii) <x|Op|y> = <y|AdjOp|x>^\ast
@ -183,13 +182,13 @@ namespace Grid {
virtual RealD Mpc (const Field &in, Field &out) =0; virtual RealD Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0; virtual RealD MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in._grid); Field tmp(in.Grid());
tmp.checkerboard = in.checkerboard; tmp.Checkerboard() = in.Checkerboard();
ni=Mpc(in,tmp); ni=Mpc(in,tmp);
no=MpcDag(tmp,out); no=MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.checkerboard = in.checkerboard; out.Checkerboard() = in.Checkerboard();
MpcDagMpc(in,out,n1,n2); MpcDagMpc(in,out,n1,n2);
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
@ -216,20 +215,20 @@ namespace Grid {
Matrix &_Mat; Matrix &_Mat;
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid); Field tmp(in.Grid());
tmp.checkerboard = !in.checkerboard; tmp.Checkerboard() = !in.Checkerboard();
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl; //std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp); _Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out); _Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.checkerboard << " cb out " << out.checkerboard << std::endl; //std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl;
_Mat.Mooee(in,out); _Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out); return axpy_norm(out,-1.0,tmp,out);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.MeooeDag(in,tmp); _Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out); _Mat.MooeeInvDag(tmp,out);
@ -247,7 +246,7 @@ namespace Grid {
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp); _Mat.MooeeInv(out,tmp);
@ -257,7 +256,7 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in); return axpy_norm(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.MooeeInvDag(in,out); _Mat.MooeeInvDag(in,out);
_Mat.MeooeDag(out,tmp); _Mat.MeooeDag(out,tmp);
@ -275,7 +274,7 @@ namespace Grid {
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.MooeeInv(in,out); _Mat.MooeeInv(in,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
@ -285,7 +284,7 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in); return axpy_norm(out,-1.0,tmp,in);
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid); Field tmp(in.Grid());
_Mat.MeooeDag(in,out); _Mat.MeooeDag(in,out);
_Mat.MooeeInvDag(out,tmp); _Mat.MooeeInvDag(out,tmp);
@ -353,7 +352,17 @@ namespace Grid {
axpby(out,-1.0,mass*mass,tmp,in); axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond(); taxpby_norm+=usecond();
} }
virtual RealD Mpc (const Field &in, Field &out) { virtual RealD Mpc (const Field &in, Field &out)
{
Field tmp(in.Grid());
Field tmp2(in.Grid());
// std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
_Mat.Mooee(in,out);
_Mat.Mooee(out,tmp);
// std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
tMeo-=usecond(); tMeo-=usecond();
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
@ -464,13 +473,15 @@ namespace Grid {
private: private:
std::vector<RealD> Coeffs; std::vector<RealD> Coeffs;
public: public:
using OperatorFunction<Field>::operator();
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { }; Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface // Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in._grid); Field AtoN(in.Grid());
Field Mtmp(in._grid); Field Mtmp(in.Grid());
AtoN = in; AtoN = in;
out = AtoN*Coeffs[0]; out = AtoN*Coeffs[0];
for(int n=1;n<Coeffs.size();n++){ for(int n=1;n<Coeffs.size();n++){
@ -481,6 +492,4 @@ namespace Grid {
}; };
}; };
} NAMESPACE_END(Grid);
#endif

View File

@ -28,7 +28,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
#ifndef GRID_PRECONDITIONER_H #ifndef GRID_PRECONDITIONER_H
#define GRID_PRECONDITIONER_H #define GRID_PRECONDITIONER_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class Field> class Preconditioner : public LinearFunction<Field> { template<class Field> class Preconditioner : public LinearFunction<Field> {
virtual void operator()(const Field &src, Field & psi)=0; virtual void operator()(const Field &src, Field & psi)=0;
@ -42,5 +42,5 @@ namespace Grid {
TrivialPrecon(void){}; TrivialPrecon(void){};
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#define GRID_ALGORITHM_SPARSE_MATRIX_H #define GRID_ALGORITHM_SPARSE_MATRIX_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action // Interface defining what I expect of a general sparse matrix, such as a Fermion action
@ -41,7 +41,7 @@ namespace Grid {
virtual RealD M (const Field &in, Field &out)=0; virtual RealD M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0; virtual RealD Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp (in._grid); Field tmp (in.Grid());
ni=M(in,tmp); ni=M(in,tmp);
no=Mdag(tmp,out); no=Mdag(tmp,out);
} }
@ -74,6 +74,6 @@ namespace Grid {
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -32,7 +32,7 @@ Author: Christoph Lehner <clehner@bnl.gov>
#include <Grid/algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
struct ChebyParams : Serializable { struct ChebyParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams, GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
@ -47,6 +47,8 @@ struct ChebyParams : Serializable {
template<class Field> template<class Field>
class Chebyshev : public OperatorFunction<Field> { class Chebyshev : public OperatorFunction<Field> {
private: private:
using OperatorFunction<Field>::operator();
std::vector<RealD> Coeffs; std::vector<RealD> Coeffs;
int order; int order;
RealD hi; RealD hi;
@ -55,7 +57,7 @@ struct ChebyParams : Serializable {
public: public:
void csv(std::ostream &out){ void csv(std::ostream &out){
RealD diff = hi-lo; RealD diff = hi-lo;
RealD delta = (hi-lo)*1.0e-9; RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) { for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1; delta*=1.1;
RealD f = approx(x); RealD f = approx(x);
@ -212,9 +214,9 @@ struct ChebyParams : Serializable {
// Implement the required interface // Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid; GridBase *grid=in.Grid();
// std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl; // std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl; //std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites(); int vol=grid->gSites();
@ -321,7 +323,7 @@ struct ChebyParams : Serializable {
// shift_Multiply in Rudy's code // shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out) void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{ {
GridBase *grid=in._grid; GridBase *grid=in.Grid();
Field tmp(grid); Field tmp(grid);
RealD aa= alpha*alpha; RealD aa= alpha*alpha;
@ -338,7 +340,7 @@ struct ChebyParams : Serializable {
// Implement the required interface // Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) { void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid; GridBase *grid=in.Grid();
int vol=grid->gSites(); int vol=grid->gSites();
@ -373,5 +375,5 @@ struct ChebyParams : Serializable {
} }
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -31,7 +31,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#ifndef INCLUDED_FORECAST_H #ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H #define INCLUDED_FORECAST_H
namespace Grid { NAMESPACE_BEGIN(Grid);
// Abstract base class. // Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi) // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
@ -57,10 +57,10 @@ namespace Grid {
Field chi(phi); // forecasted solution Field chi(phi); // forecasted solution
// Trivial cases // Trivial cases
if(degree == 0){ chi = zero; return chi; } if(degree == 0){ chi = Zero(); return chi; }
else if(degree == 1){ return prev_solns[0]; } else if(degree == 1){ return prev_solns[0]; }
RealD dot; // RealD dot;
ComplexD xp; ComplexD xp;
Field r(phi); // residual Field r(phi); // residual
Field Mv(phi); Field Mv(phi);
@ -92,7 +92,7 @@ namespace Grid {
for(int j=0; j<degree; j++){ for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){ for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]); G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = std::conj(G[j][k]); G[k][j] = conjugate(G[j][k]);
}} }}
// Gauss-Jordan elimination with partial pivoting // Gauss-Jordan elimination with partial pivoting
@ -100,7 +100,7 @@ namespace Grid {
// Perform partial pivoting // Perform partial pivoting
int k = i; int k = i;
for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } } for(int j=i+1; j<degree; j++){ if(abs(G[j][j]) > abs(G[k][k])){ k = j; } }
if(k != i){ if(k != i){
xp = b[k]; xp = b[k];
b[k] = b[i]; b[k] = b[i];
@ -121,7 +121,7 @@ namespace Grid {
} }
// Use Gaussian elimination to solve equations and calculate initial guess // Use Gaussian elimination to solve equations and calculate initial guess
chi = zero; chi = Zero();
r = phi; r = phi;
for(int i=degree-1; i>=0; i--){ for(int i=degree-1; i>=0; i--){
a[i] = 0.0; a[i] = 0.0;
@ -136,7 +136,7 @@ namespace Grid {
for(int i=0; i<degree; i++){ for(int i=0; i<degree; i++){
tmp = -b[i]; tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; } for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = std::conj(tmp)*tmp; tmp = conjugate(tmp)*tmp;
true_r += std::sqrt(tmp.real()); true_r += std::sqrt(tmp.real());
} }
@ -147,6 +147,6 @@ namespace Grid {
}; };
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -27,7 +27,8 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
double MultiShiftFunction::approx(double x) double MultiShiftFunction::approx(double x)
{ {
double a = norm; double a = norm;
@ -53,4 +54,4 @@ void MultiShiftFunction::csv(std::ostream &out)
} }
return; return;
} }
} NAMESPACE_END(Grid);

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef MULTI_SHIFT_FUNCTION #ifndef MULTI_SHIFT_FUNCTION
#define MULTI_SHIFT_FUNCTION #define MULTI_SHIFT_FUNCTION
namespace Grid { NAMESPACE_BEGIN(Grid);
class MultiShiftFunction { class MultiShiftFunction {
public: public:
@ -63,5 +63,5 @@ public:
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -298,7 +298,7 @@ void AlgRemez::stpini(bigfloat *step) {
// Search for error maxima and minima // Search for error maxima and minima
void AlgRemez::search(bigfloat *step) { void AlgRemez::search(bigfloat *step) {
bigfloat a, q, xm, ym, xn, yn, xx0, xx1; bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
int i, j, meq, emsign, ensign, steps; int i, meq, emsign, ensign, steps;
meq = neq + 1; meq = neq + 1;
bigfloat *yy = new bigfloat[meq]; bigfloat *yy = new bigfloat[meq];
@ -306,7 +306,6 @@ void AlgRemez::search(bigfloat *step) {
bigfloat eclose = 1.0e30; bigfloat eclose = 1.0e30;
bigfloat farther = 0l; bigfloat farther = 0l;
j = 1;
xx0 = apstrt; xx0 = apstrt;
for (i = 0; i < meq; i++) { for (i = 0; i < meq; i++) {

View File

@ -58,8 +58,8 @@
/* Compute the partial fraction expansion coefficients (alpha) from the /* Compute the partial fraction expansion coefficients (alpha) from the
* factored form */ * factored form */
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace Approx { NAMESPACE_BEGIN(Approx);
static void construct_partfrac(izd *z) { static void construct_partfrac(izd *z) {
int dn = z -> dn, dd = z -> dd, type = z -> type; int dn = z -> dn, dd = z -> dd, type = z -> type;
@ -516,7 +516,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
free(d); free(d);
return zd; return zd;
} }
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#ifdef TEST #ifdef TEST
@ -585,6 +587,7 @@ static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
return (ONE - T) / (ONE + T); return (ONE - T) / (ONE + T);
} }
/* Test program. Apart from printing out the parameters for R(x) it produces /* Test program. Apart from printing out the parameters for R(x) it produces
* the following data files for plotting (unless NPLOT is defined): * the following data files for plotting (unless NPLOT is defined):
* *
@ -723,5 +726,5 @@ int main(int argc, char** argv) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
#endif /* TEST */ #endif /* TEST */

View File

@ -1,13 +1,13 @@
/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */ /* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
#ifdef __cplusplus #ifdef __cplusplus
namespace Grid { #include <Grid/Namespace.h>
namespace Approx { NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
#endif #endif
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL #ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION #ifndef PRECISION
#define PRECISION double #define PRECISION double
@ -83,5 +83,6 @@ void zolotarev_free(zolotarev_data *zdata);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
}} NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif #endif

View File

@ -10,10 +10,12 @@
#ifndef INCLUDED_BIGFLOAT_H #ifndef INCLUDED_BIGFLOAT_H
#define INCLUDED_BIGFLOAT_H #define INCLUDED_BIGFLOAT_H
#define __GMP_WITHIN_CONFIGURE
#include <gmp.h> #include <gmp.h>
#include <mpf2mpfr.h> #include <mpf2mpfr.h>
#include <mpfr.h> #include <mpfr.h>
#undef __GMP_WITHIN_CONFIGURE
class bigfloat { class bigfloat {
private: private:

View File

@ -90,8 +90,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
grid = src._grid; grid = src.Grid();
RealD f; RealD f;
RealD rtzp,rtz,a,d,b; RealD rtzp,rtz,a,d,b;

View File

@ -27,11 +27,9 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H #pragma once
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
NAMESPACE_BEGIN(Grid);
namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
@ -154,12 +152,12 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{ {
int Orthog = blockDim; // First dimension is block dim; this is an assumption int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog]; Nblock = B.Grid()->_fdimensions[Orthog];
/* FAKE */ /* FAKE */
Nblock=8; Nblock=8;
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard; X.Checkerboard() = B.Checkerboard();
conformable(X, B); conformable(X, B);
Field tmp(B); Field tmp(B);
@ -334,11 +332,11 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi) void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{ {
int Orthog = blockDim; // First dimension is block dim int Orthog = blockDim; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog]; Nblock = Src.Grid()->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard; Psi.Checkerboard() = Src.Checkerboard();
conformable(Psi, Src); conformable(Psi, Src);
Field P(Src); Field P(Src);
@ -478,7 +476,7 @@ void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<
for(int b=0;b<Nblock;b++){ for(int b=0;b<Nblock;b++){
tmp[b] = Y[b]; tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) { for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + (scale*m(bp,b))*X[bp]; tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
} }
} }
for(int b=0;b<Nblock;b++){ for(int b=0;b<Nblock;b++){
@ -488,9 +486,9 @@ void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){ void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for // Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){ for(int b=0;b<Nblock;b++){
AP[b] = zero; AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) { for(int bp=0;bp<Nblock;bp++) {
AP[b] += (m(bp,b))*X[bp]; AP[b] += scomplex(m(bp,b))*X[bp];
} }
} }
} }
@ -517,7 +515,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
for(int b=0;b<Nblock;b++){ for(int b=0;b<Nblock;b++){
X[b].checkerboard = B[b].checkerboard; X[b].Checkerboard() = B[b].Checkerboard();
conformable(X[b], B[b]); conformable(X[b], B[b]);
conformable(X[b], X[0]); conformable(X[b], X[0]);
} }
@ -690,9 +688,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
IterationsToComplete = k; IterationsToComplete = k;
} }
}; };
} NAMESPACE_END(Grid);
#endif

View File

@ -34,6 +34,8 @@ namespace Grid {
template<class Field> template<class Field>
class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> { class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
// defaults to true // defaults to true
@ -52,10 +54,10 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
CommunicationAvoidingGeneralisedMinimalResidual(RealD tol, CommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit, Integer maxit,
@ -76,7 +78,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl; std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -86,7 +88,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid); Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
@ -142,11 +144,11 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
RealD cp = 0; RealD cp = 0;
Field w(src._grid); Field w(src.Grid());
Field r(src._grid); Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart // this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -157,7 +159,9 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
gamma[0] = sqrt(norm2(r)); gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r; ComplexD scale = 1.0/gamma[0];
v[0] = scale * r;
LinalgTimer.Stop(); LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) { for (int i=0; i<RestartLength; i++) {
@ -168,7 +172,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -194,11 +198,11 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -206,13 +210,13 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -221,7 +225,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -231,8 +235,8 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -31,7 +31,7 @@ directory
#ifndef GRID_CONJUGATE_GRADIENT_H #ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H #define GRID_CONJUGATE_GRADIENT_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@ -41,6 +41,9 @@ namespace Grid {
template <class Field> template <class Field>
class ConjugateGradient : public OperatorFunction<Field> { class ConjugateGradient : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
@ -54,11 +57,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
psi.checkerboard = src.checkerboard;
conformable(psi, src); conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred; RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred;
Field p(src); Field p(src);
Field mmp(src); Field mmp(src);
@ -127,10 +131,13 @@ class ConjugateGradient : public OperatorFunction<Field> {
b = cp / c; b = cp / c;
LinearCombTimer.Start(); LinearCombTimer.Start();
parallel_for(int ss=0;ss<src._grid->oSites();ss++){ auto psi_v = psi.View();
vstream(psi[ss], a * p[ss] + psi[ss]); auto p_v = p.View();
vstream(p [ss], b * p[ss] + r[ss]); auto r_v = r.View();
} accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
@ -143,12 +150,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
Linop.HermOpAndNorm(psi, mmp, d, qq); Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src; p = mmp - src;
RealD srcnorm = sqrt(norm2(src)); RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p)); RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl; std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
@ -174,5 +181,5 @@ class ConjugateGradient : public OperatorFunction<Field> {
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -28,8 +28,7 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H #ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H #define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
namespace Grid { NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG //Mixed precision restarted defect correction CG
template<class FieldD,class FieldF, template<class FieldD,class FieldF,
@ -73,18 +72,18 @@ namespace Grid {
GridStopWatch TotalTimer; GridStopWatch TotalTimer;
TotalTimer.Start(); TotalTimer.Start();
int cb = src_d_in.checkerboard; int cb = src_d_in.Checkerboard();
sol_d.checkerboard = cb; sol_d.Checkerboard() = cb;
RealD src_norm = norm2(src_d_in); RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance; RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid; GridBase* DoublePrecGrid = src_d_in.Grid();
FieldD tmp_d(DoublePrecGrid); FieldD tmp_d(DoublePrecGrid);
tmp_d.checkerboard = cb; tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid); FieldD tmp2_d(DoublePrecGrid);
tmp2_d.checkerboard = cb; tmp2_d.Checkerboard() = cb;
FieldD src_d(DoublePrecGrid); FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation src_d = src_d_in; //source for next inner iteration, computed from residual during operation
@ -92,10 +91,10 @@ namespace Grid {
RealD inner_tol = InnerTolerance; RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid); FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb; src_f.Checkerboard() = cb;
FieldF sol_f(SinglePrecGrid); FieldF sol_f(SinglePrecGrid);
sol_f.checkerboard = cb; sol_f.Checkerboard() = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false; CG_f.ErrorOnNoConverge = false;
@ -123,7 +122,7 @@ namespace Grid {
precisionChange(src_f, src_d); precisionChange(src_f, src_d);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
zeroit(sol_f); sol_f = Zero();
//Optionally improve inner solver guess (eg using known eigenvectors) //Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) if(guesser != NULL)
@ -157,8 +156,6 @@ namespace Grid {
} }
}; };
NAMESPACE_END(Grid);
}
#endif #endif

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H #ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H #define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@ -41,6 +41,9 @@ namespace Grid {
public OperatorFunction<Field> public OperatorFunction<Field>
{ {
public: public:
using OperatorFunction<Field>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
@ -56,7 +59,7 @@ public:
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{ {
GridBase *grid = src._grid; GridBase *grid = src.Grid();
int nshift = shifts.order; int nshift = shifts.order;
std::vector<Field> results(nshift,grid); std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi); (*this)(Linop,src,results,psi);
@ -78,7 +81,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi) void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{ {
GridBase *grid = src._grid; GridBase *grid = src.Grid();
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction" // Convenience references to the info stored in "MultiShiftFunction"
@ -318,5 +321,5 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -28,9 +28,11 @@ Author: Christopher Kelly <ckelly@phys.columbia.edu>
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H #ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H #define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> { class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public: public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
@ -74,7 +76,7 @@ namespace Grid {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f; LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false; bool using_fallback = false;
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred; RealD cp, c, a, d, b, ssq, qq, b_pred;
@ -108,17 +110,17 @@ namespace Grid {
// Check if guess is really REALLY good :) // Check if guess is really REALLY good :)
if (cp <= rsq) { if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n"; std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
return; return;
} }
//Single prec initialization //Single prec initialization
FieldF r_f(SinglePrecGrid); FieldF r_f(SinglePrecGrid);
r_f.checkerboard = r.checkerboard; r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r); precisionChange(r_f, r);
FieldF psi_f(r_f); FieldF psi_f(r_f);
psi_f = zero; psi_f = Zero();
FieldF p_f(r_f); FieldF p_f(r_f);
FieldF mmp_f(r_f); FieldF mmp_f(r_f);
@ -178,12 +180,12 @@ namespace Grid {
Linop_d.HermOpAndNorm(psi, mmp, d, qq); Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src; p = mmp - src;
RealD srcnorm = sqrt(norm2(src)); RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p)); RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl; std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl; std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl; std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl; std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
@ -217,7 +219,7 @@ namespace Grid {
Linop_d.HermOpAndNorm(psi, mmp, d, qq); Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp; r = src - mmp;
psi_f = zero; psi_f = Zero();
precisionChange(r_f, r); precisionChange(r_f, r);
cp = norm2(r); cp = norm2(r);
MaxResidSinceLastRelUp = cp; MaxResidSinceLastRelUp = cp;
@ -249,7 +251,7 @@ namespace Grid {
}; };
}; NAMESPACE_END(Grid);

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CONJUGATE_RESIDUAL_H #ifndef GRID_CONJUGATE_RESIDUAL_H
#define GRID_CONJUGATE_RESIDUAL_H #define GRID_CONJUGATE_RESIDUAL_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@ -39,6 +39,8 @@ namespace Grid {
template<class Field> template<class Field>
class ConjugateResidual : public OperatorFunction<Field> { class ConjugateResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
int verbose; int verbose;
@ -49,14 +51,14 @@ namespace Grid {
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b, c, d; RealD a, b; // c, d;
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp; RealD rAr, rAAr, rArp;
RealD pAp, pAAp; RealD pAp, pAAp;
GridBase *grid = src._grid; GridBase *grid = src.Grid();
psi=zero; psi=Zero();
Field r(grid), p(grid), Ap(grid), Ar(grid); Field r(grid), p(grid), Ap(grid), Ar(grid);
r=src; r=src;
@ -95,8 +97,8 @@ namespace Grid {
axpy(r,-1.0,src,Ap); axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq; RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq) << " computed residual "<<std::sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid) << " true residual "<<std::sqrt(true_resid)
<< " target " <<Tolerance <<std::endl; << " target " <<Tolerance <<std::endl;
return; return;
} }
@ -107,5 +109,5 @@ namespace Grid {
assert(0); assert(0);
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -33,7 +33,7 @@ namespace Grid {
template<class Field> template<class Field>
class ZeroGuesser: public LinearFunction<Field> { class ZeroGuesser: public LinearFunction<Field> {
public: public:
virtual void operator()(const Field &src, Field &guess) { guess = zero; }; virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
}; };
template<class Field> template<class Field>
class DoNothingGuesser: public LinearFunction<Field> { class DoNothingGuesser: public LinearFunction<Field> {
@ -60,14 +60,14 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}; DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) { virtual void operator()(const Field &src,Field &guess) {
guess = zero; guess = Zero();
assert(evec.size()==eval.size()); assert(evec.size()==eval.size());
auto N = evec.size(); auto N = evec.size();
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
const Field& tmp = evec[i]; const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
} }
guess.checkerboard = src.checkerboard; guess.Checkerboard() = src.Checkerboard();
} }
}; };
@ -90,15 +90,15 @@ public:
void operator()(const FineField &src,FineField &guess) { void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size(); int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid); CoarseField src_coarse(evec_coarse[0].Grid());
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero; CoarseField guess_coarse(evec_coarse[0].Grid()); guess_coarse = Zero();
blockProject(src_coarse,src,subspace); blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i]; const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse); axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
} }
blockPromote(guess_coarse,guess,subspace); blockPromote(guess_coarse,guess,subspace);
guess.checkerboard = src.checkerboard; guess.Checkerboard() = src.Checkerboard();
}; };
}; };

View File

@ -34,6 +34,8 @@ namespace Grid {
template<class Field> template<class Field>
class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> { class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
// defaults to true // defaults to true
@ -53,10 +55,10 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner; LinearFunction<Field> &Preconditioner;
@ -81,7 +83,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl; std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -91,7 +93,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid); Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
@ -149,12 +151,12 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
RealD cp = 0; RealD cp = 0;
Field w(src._grid); Field w(src.Grid());
Field r(src._grid); Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart // these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -176,7 +178,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -206,11 +208,11 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -218,13 +220,13 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -233,7 +235,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -243,8 +245,8 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -34,6 +34,8 @@ namespace Grid {
template<class Field> template<class Field>
class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> { class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
// defaults to true // defaults to true
@ -53,10 +55,10 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner; LinearFunction<Field> &Preconditioner;
@ -79,7 +81,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -89,7 +91,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid); Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
@ -147,12 +149,12 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD cp = 0; RealD cp = 0;
Field w(src._grid); Field w(src.Grid());
Field r(src._grid); Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart // these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -174,7 +176,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -204,11 +206,11 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -216,13 +218,13 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -231,7 +233,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -241,8 +243,8 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -34,6 +34,8 @@ namespace Grid {
template<class Field> template<class Field>
class GeneralisedMinimalResidual : public OperatorFunction<Field> { class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
// defaults to true // defaults to true
@ -52,10 +54,10 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
GeneralisedMinimalResidual(RealD tol, GeneralisedMinimalResidual(RealD tol,
Integer maxit, Integer maxit,
@ -74,7 +76,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -84,7 +86,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
Field r(src._grid); Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
@ -140,11 +142,11 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
RealD cp = 0; RealD cp = 0;
Field w(src._grid); Field w(src.Grid());
Field r(src._grid); Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart // this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -166,7 +168,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -192,11 +194,11 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -204,13 +206,13 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -219,7 +221,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -229,8 +231,8 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -35,7 +35,7 @@ Author: Christoph Lehner <clehner@bnl.gov>
//#include <zlib.h> //#include <zlib.h>
#include <sys/stat.h> #include <sys/stat.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h // Move following 100 LOC to lattice/Lattice_basis.h
@ -52,26 +52,31 @@ void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
template<class Field> template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm) void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].View();
std::vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid; GridBase* grid = basis[0].Grid();
parallel_region for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
thread_region
{ {
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
thread_for_in_region(ss, grid->oSites(),{
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.; for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){ for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis[k]._odata[ss]; B[j] +=Qt(j,k) * basis_v[k][ss];
} }
} }
for(int j=j0; j<j1; ++j){ for(int j=j0; j<j1; ++j){
basis[j]._odata[ss] = B[j]; basis_v[j][ss] = B[j];
}
} }
});
} }
} }
@ -80,16 +85,18 @@ template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{ {
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid; GridBase* grid = basis[0].Grid();
result.checkerboard = basis[0].checkerboard; result.Checkerboard() = basis[0].Checkerboard();
parallel_for(int ss=0;ss < grid->oSites();ss++){ auto result_v=result.View();
vobj B = zero; thread_for(ss, grid->oSites(),{
vobj B = Zero();
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B +=Qt(j,k) * basis[k]._odata[ss]; auto basis_k = basis[k].View();
} B +=Qt(j,k) * basis_k[ss];
result._odata[ss] = B;
} }
result_v[ss] = B;
});
} }
template<class Field> template<class Field>
@ -119,7 +126,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i); assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]); std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i]; idx[j] = idx[i];
@ -150,6 +157,19 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
basisReorderInPlace(_v,sort_vals,idx); basisReorderInPlace(_v,sort_vals,idx);
} }
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
//
// basisDeflate: builds `result` from the basis vectors `_v` and the values `eval`.
//   _v       : basis vectors (read only)
//   eval     : one value per basis vector; must have the same length as _v (asserted)
//   src_orig : field that each basis vector is contracted with via innerProduct
//   result   : output; reset to Zero() and then updated once per basis vector
// For each i the coefficient innerProduct(_v[i],src_orig)/eval[i] is passed to axpy
// together with _v[i] and the running result.
// NOTE(review): presumably axpy(r,a,x,y) computes r = a*x + y, making this the
// usual deflation sum -- confirm against the axpy definition.
// eval[i] is not checked for zero before the division.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
  result = Zero();
  assert(_v.size()==eval.size());
  const int N = static_cast<int>(_v.size());
  for (int i=0;i<N;i++) {
    // _v is a const vector, so its elements can only be bound to a const
    // reference; a non-const `Field&` here fails to compile on instantiation.
    const Field& tmp = _v[i];
    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
  }
}
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Implicitly restarted lanczos // Implicitly restarted lanczos
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
@ -289,7 +309,7 @@ public:
template<typename T> static RealD normalise(T& v) template<typename T> static RealD normalise(T& v)
{ {
RealD nn = norm2(v); RealD nn = norm2(v);
nn = sqrt(nn); nn = std::sqrt(nn);
v = v * (1.0/nn); v = v * (1.0/nn);
return nn; return nn;
} }
@ -321,8 +341,8 @@ until convergence
*/ */
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false) void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{ {
GridBase *grid = src._grid; GridBase *grid = src.Grid();
assert(grid == evec[0]._grid); assert(grid == evec[0].Grid());
GridLogIRL.TimingMode(1); GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@ -446,7 +466,7 @@ until convergence
assert(k2<Nm); assert(k2<Nm); assert(k1>0); assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl; std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Compressed vector f and beta(k2) // Compressed vector f and beta(k2)
@ -454,7 +474,7 @@ until convergence
f *= Qt(k2-1,Nm-1); f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2]; f += lme[k2-1] * evec[k2];
beta_k = norm2(f); beta_k = norm2(f);
beta_k = sqrt(beta_k); beta_k = std::sqrt(beta_k);
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl; std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k; RealD betar = 1.0/beta_k;
@ -477,7 +497,7 @@ until convergence
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl; std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
Field B(grid); B.checkerboard = evec[0].checkerboard; Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
// power of two search pattern; not every evalue in eval2 is assessed. // power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1; int allconv =1;
@ -515,7 +535,7 @@ until convergence
converged: converged:
{ {
Field B(grid); B.checkerboard = evec[0].checkerboard; Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
basisRotate(evec,Qt,0,Nk,0,Nk,Nm); basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl; std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0; Nconv=0;
@ -807,7 +827,7 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
// determination of 2x2 leading submatrix // determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2]; RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]); RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub))); RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift) // (Dsh: shift)
@ -838,5 +858,6 @@ void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
abort(); abort();
} }
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LOCAL_COHERENCE_IRL_H #ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H #define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid { NAMESPACE_BEGIN(Grid);
struct LanczosParams : Serializable { struct LanczosParams : Serializable {
public: public:
@ -59,7 +58,7 @@ struct LocalCoherenceLanczosParams : Serializable {
RealD , coarse_relax_tol, RealD , coarse_relax_tol,
std::vector<int>, blockSize, std::vector<int>, blockSize,
std::string, config, std::string, config,
std::vector < std::complex<double> >, omega, std::vector < ComplexD >, omega,
RealD, mass, RealD, mass,
RealD, M5); RealD, M5);
}; };
@ -83,11 +82,11 @@ public:
}; };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid; GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].checkerboard; int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.checkerboard= checkerboard; FineField fin (FineGrid); fin.Checkerboard()= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard; FineField fout(FineGrid); fout.Checkerboard() = checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl; _Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
@ -118,11 +117,11 @@ public:
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid; GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].checkerboard; int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.checkerboard =checkerboard; FineField fin (FineGrid); fin.Checkerboard() =checkerboard;
FineField fout(FineGrid);fout.checkerboard =checkerboard; FineField fout(FineGrid);fout.Checkerboard() =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl; blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl; _poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
@ -182,10 +181,10 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
} }
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
GridBase *FineGrid = _subspace[0]._grid; GridBase *FineGrid = _subspace[0].Grid();
int checkerboard = _subspace[0].checkerboard; int checkerboard = _subspace[0].Checkerboard();
FineField fB(FineGrid);fB.checkerboard =checkerboard; FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard; FineField fv(FineGrid);fv.Checkerboard() =checkerboard;
blockPromote(B,fv,_subspace); blockPromote(B,fv,_subspace);
@ -305,11 +304,11 @@ public:
int Nk = nbasis; int Nk = nbasis;
subspace.resize(Nk,_FineGrid); subspace.resize(Nk,_FineGrid);
subspace[0]=1.0; subspace[0]=1.0;
subspace[0].checkerboard=_checkerboard; subspace[0].Checkerboard()=_checkerboard;
normalise(subspace[0]); normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){ for(int k=1;k<Nk;k++){
subspace[k].checkerboard=_checkerboard; subspace[k].Checkerboard()=_checkerboard;
Op(subspace[k-1],subspace[k]); Op(subspace[k-1],subspace[k]);
normalise(subspace[k]); normalise(subspace[k]);
} }
@ -360,7 +359,11 @@ public:
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; FineField src(_FineGrid);
typedef typename FineField::scalar_type Scalar;
// src=1.0;
src=Scalar(1.0);
src.Checkerboard() = _checkerboard;
int Nconv; int Nconv;
IRL.calc(evals_fine,subspace,src,Nconv,false); IRL.calc(evals_fine,subspace,src,Nconv,false);
@ -402,5 +405,5 @@ public:
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -33,6 +33,8 @@ namespace Grid {
template<class Field> class MinimalResidual : public OperatorFunction<Field> { template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the MR fails to converge. bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
@ -46,11 +48,11 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
Complex a, c; ComplexD a, c;
Real d; RealD d;
Field Mr(src); Field Mr(src);
Field r(src); Field r(src);
@ -71,7 +73,6 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl; std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "MinimalResidual: src " << ssq << std::endl; std::cout << GridLogIterative << "MinimalResidual: src " << ssq << std::endl;
std::cout << GridLogIterative << "MinimalResidual: mp " << d << std::endl;
std::cout << GridLogIterative << "MinimalResidual: cp,r " << cp << std::endl; std::cout << GridLogIterative << "MinimalResidual: cp,r " << cp << std::endl;
if (cp <= rsq) { if (cp <= rsq) {

View File

@ -34,6 +34,9 @@ namespace Grid {
template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> { class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
public: public:
using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
// defaults to true // defaults to true
@ -54,10 +57,10 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
Eigen::MatrixXcd H; Eigen::MatrixXcd H;
std::vector<std::complex<double>> y; std::vector<ComplexD> y;
std::vector<std::complex<double>> gamma; std::vector<ComplexD> gamma;
std::vector<std::complex<double>> c; std::vector<ComplexD> c;
std::vector<std::complex<double>> s; std::vector<ComplexD> s;
GridBase* SinglePrecGrid; GridBase* SinglePrecGrid;
@ -84,7 +87,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) { void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
psi.checkerboard = src.checkerboard; psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
@ -94,7 +97,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
FieldD r(src._grid); FieldD r(src.Grid());
std::cout << std::setprecision(4) << std::scientific; std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl; std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
@ -154,12 +157,12 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
RealD cp = 0; RealD cp = 0;
FieldD w(src._grid); FieldD w(src.Grid());
FieldD r(src._grid); FieldD r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart // these should probably be made class members so that they are only allocated once, not in every restart
std::vector<FieldD> v(RestartLength + 1, src._grid); for (auto &elem : v) elem = zero; std::vector<FieldD> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<FieldD> z(RestartLength + 1, src._grid); for (auto &elem : z) elem = zero; std::vector<FieldD> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start(); MatrixTimer.Start();
LinOp.Op(psi, w); LinOp.Op(psi, w);
@ -181,7 +184,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
qrUpdate(i); qrUpdate(i);
cp = std::norm(gamma[i+1]); cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl; << " residual " << cp << " target " << rsq << std::endl;
@ -223,11 +226,11 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
LinalgTimer.Start(); LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) { for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w); H(iter, i) = innerProduct(v[i], w);
w = w - H(iter, i) * v[i]; w = w - ComplexD(H(iter, i)) * v[i];
} }
H(iter, iter + 1) = sqrt(norm2(w)); H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = (1. / H(iter, iter + 1)) * w; v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
@ -235,13 +238,13 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
QrTimer.Start(); QrTimer.Start();
for (int i = 0; i < iter ; ++i) { for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * H(iter, i) + c[i] * H(iter, i + 1); auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = std::conj(c[i]) * H(iter, i) + std::conj(s[i]) * H(iter, i + 1); H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp; H(iter, i + 1) = tmp;
} }
// Compute new Givens Rotation // Compute new Givens Rotation
ComplexD nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1))); auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu; c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu; s[iter] = H(iter, iter + 1) / nu;
@ -250,7 +253,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
H(iter, iter + 1) = 0.; H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter]; gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = std::conj(c[iter]) * gamma[iter]; gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop(); QrTimer.Stop();
} }
@ -260,8 +263,8 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
for (int i = iter; i >= 0; i--) { for (int i = iter; i >= 0; i--) {
y[i] = gamma[i]; y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++) for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - H(k, i) * y[k]; y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / H(i, i); y[i] = y[i] / ComplexD(H(i, i));
} }
for (int i = 0; i <= iter; i++) for (int i = 0; i <= iter; i++)

View File

@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_NORMAL_EQUATIONS_H #ifndef GRID_NORMAL_EQUATIONS_H
#define GRID_NORMAL_EQUATIONS_H #define GRID_NORMAL_EQUATIONS_H
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver // Take a matrix and form an NE solver calling a Herm solver
@ -48,7 +48,7 @@ namespace Grid {
void operator() (const Field &in, Field &out){ void operator() (const Field &in, Field &out){
Field src(in._grid); Field src(in.Grid());
_Matrix.Mdag(in,src); _Matrix.Mdag(in,src);
_HermitianSolver(src,out); // Mdag M out = Mdag in _HermitianSolver(src,out); // Mdag M out = Mdag in
@ -56,5 +56,5 @@ namespace Grid {
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -14,7 +14,7 @@ template<class Field> class PowerMethod
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src) RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{ {
GridBase *grid = src._grid; GridBase *grid = src.Grid();
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;

View File

@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H #ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
#define GRID_PREC_CONJUGATE_RESIDUAL_H #define GRID_PREC_CONJUGATE_RESIDUAL_H
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators // Base classes for iterative processes based on operators
@ -56,7 +56,7 @@ namespace Grid {
RealD rAr, rAAr, rArp; RealD rAr, rAAr, rArp;
RealD pAp, pAAp; RealD pAp, pAAp;
GridBase *grid = src._grid; GridBase *grid = src.Grid();
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid); Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=zero; psi=zero;
@ -115,5 +115,5 @@ namespace Grid {
assert(0); assert(0);
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -36,11 +36,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
//NB. Likely not original reference since they are focussing on a preconditioner variant. //NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper // but VPGCR was nicely written up in their paper
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class Field> template<class Field>
class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> { class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
int verbose; int verbose;
@ -65,12 +67,12 @@ namespace Grid {
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){ void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
psi=zero; psi=Zero();
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
ssq=norm2(src); ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq; rsq=Tolerance*Tolerance*ssq;
Field r(src._grid); Field r(src.Grid());
PrecTimer.Reset(); PrecTimer.Reset();
MatTimer.Reset(); MatTimer.Reset();
@ -113,11 +115,11 @@ namespace Grid {
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD cp; RealD cp;
RealD a, b, c, d; RealD a, b;
RealD zAz, zAAz; RealD zAz, zAAz;
RealD rAq, rq; RealD rq;
GridBase *grid = src._grid; GridBase *grid = src.Grid();
Field r(grid); Field r(grid);
Field z(grid); Field z(grid);
@ -132,6 +134,7 @@ namespace Grid {
std::vector<Field> p(mmax,grid); std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax); std::vector<RealD> qq(mmax);
////////////////////////////////// //////////////////////////////////
// initial guess x0 is taken as nonzero. // initial guess x0 is taken as nonzero.
// r0=src-A x0 = src // r0=src-A x0 = src
@ -232,10 +235,9 @@ namespace Grid {
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
assert(0); // never reached assert(0); // never reached
return cp; return cp;
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -297,9 +297,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e) // src_o = (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm. _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
} }
@ -317,17 +317,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even); src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even); _Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix); SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@ -366,13 +366,13 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@ -386,17 +386,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; assert( src_e_i.checkerboard ==Even); src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.checkerboard ==Even); _Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.checkerboard ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@ -437,12 +437,12 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@ -463,12 +463,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.checkerboard ==Even); _Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.checkerboard ==Even); tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.checkerboard ==Even); _Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.checkerboard ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.checkerboard ==Odd ); setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)

View File

@ -1,11 +1,12 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <fcntl.h> #include <fcntl.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr; MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false; bool MemoryProfiler::debug = false;
#ifdef POINTER_CACHE
int PointerCache::victim; int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache]; PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
@ -51,7 +52,7 @@ void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL; if (bytes < 4096 ) return NULL;
#ifdef _OPENMP #ifdef GRID_OMP
assert(omp_in_parallel()==0); assert(omp_in_parallel()==0);
#endif #endif
@ -63,7 +64,7 @@ void *PointerCache::Lookup(size_t bytes) {
} }
return NULL; return NULL;
} }
#endif
void check_huge_pages(void *Buf,uint64_t BYTES) void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
@ -122,4 +123,5 @@ std::string sizeString(const size_t bytes)
return std::string(buf); return std::string(buf);
} }
} NAMESPACE_END(Grid);

View File

@ -40,8 +40,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <mm_malloc.h> #include <mm_malloc.h>
#endif #endif
namespace Grid { #define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#ifdef POINTER_CACHE
class PointerCache { class PointerCache {
private: private:
@ -63,6 +68,7 @@ namespace Grid {
static void *Lookup(size_t bytes) ; static void *Lookup(size_t bytes) ;
}; };
#endif
std::string sizeString(size_t bytes); std::string sizeString(size_t bytes);
@ -152,29 +158,45 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
////////////////// #ifdef POINTER_CACHE
// Hack 2MB align; could make option probably doesn't need configurability _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
////////////////// #else
//define GRID_ALLOC_ALIGN (128) pointer ptr = nullptr;
#define GRID_ALLOC_ALIGN (2*1024*1024) #endif
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else #else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif #endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl; assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop // First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr; //////////////////////////////////////////////////
#ifdef GRID_OMP uint64_t *cp = (uint64_t *)ptr;
#pragma omp parallel for thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0; cp[n]=0;
} });
#endif
return ptr; return ptr;
} }
@ -183,133 +205,40 @@ public:
profilerFree(bytes); profilerFree(bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme); if ( __freeme ) _mm_free((void *)__freeme);
#else #else
if ( __freeme ) free((void *)__freeme); if ( __freeme ) free((void *)__freeme);
#endif #endif
#endif
} }
void construct(pointer __p, const _Tp& __val) { };
// FIXME: hack for the copy constructor, eventually it must be avoided
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif
template<typename _Tp>
class commAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
// BACKTRACEFILE();
exit(0);
}
assert( bcast == (void *) ptr);
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
}
#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,commAllocator<T> >; template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
}; // namespace Grid NAMESPACE_END(Grid);
#endif #endif

View File

@ -30,16 +30,15 @@
#ifndef GRID_CARTESIAN_BASE_H #ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H #define GRID_CARTESIAN_BASE_H
NAMESPACE_BEGIN(Grid);
namespace Grid{
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid // Commicator provides information on the processor grid
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// unsigned long _ndimension; // unsigned long _ndimension;
// std::vector<int> _processors; // processor grid // Coordinate _processors; // processor grid
// int _processor; // linear processor rank // int _processor; // linear processor rank
// std::vector<int> _processor_coor; // linear processor rank // Coordinate _processor_coor; // linear processor rank
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread { class GridBase : public CartesianCommunicator , public GridThread {
@ -48,12 +47,14 @@ public:
// Give Lattice access // Give Lattice access
template<class object> friend class Lattice; template<class object> friend class Lattice;
GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {}; GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const std::vector<int> & processor_grid,
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent, const CartesianCommunicator &parent,
int &split_rank) int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {}; : CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const std::vector<int> & processor_grid,
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent) const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {}; : CartesianCommunicator(processor_grid,parent,dummy) {};
@ -61,23 +62,23 @@ public:
// Physics Grid information. // Physics Grid information.
std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes. Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal Coordinate _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed Coordinate _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed Coordinate _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
std::vector<int> _ostride; // Outer stride for each dimension Coordinate _ostride; // Outer stride for each dimension
std::vector<int> _istride; // Inner stride i.e. within simd lane Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions). int _osites; // _isites*_osites = product(dimensions).
int _isites; int _isites;
int _fsites; // _isites*_osites = product(dimensions). int _fsites; // _isites*_osites = product(dimensions).
int _gsites; int _gsites;
std::vector<int> _slice_block;// subslice information Coordinate _slice_block;// subslice information
std::vector<int> _slice_stride; Coordinate _slice_stride;
std::vector<int> _slice_nblock; Coordinate _slice_nblock;
std::vector<int> _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d] Coordinate _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
std::vector<int> _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1 Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded; bool _isCheckerBoarded;
@ -88,7 +89,7 @@ public:
// GridCartesian / GridRedBlackCartesian // GridCartesian / GridRedBlackCartesian
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0; virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const std::vector<int> &site)=0; virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
@ -107,20 +108,20 @@ public:
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously. // lanes are operated upon simultaneously.
virtual int oIndex(std::vector<int> &coor) virtual int oIndex(Coordinate &coor)
{ {
int idx=0; int idx=0;
// Works with either global or local coordinates // Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx; return idx;
} }
virtual int iIndex(std::vector<int> &lcoor) virtual int iIndex(Coordinate &lcoor)
{ {
int idx=0; int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx; return idx;
} }
inline int oIndexReduced(std::vector<int> &ocoor) inline int oIndexReduced(Coordinate &ocoor)
{ {
int idx=0; int idx=0;
// ocoor is already reduced so can eliminate the modulo operation // ocoor is already reduced so can eliminate the modulo operation
@ -128,11 +129,11 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d]; for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx; return idx;
} }
inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){ inline void oCoorFromOindex (Coordinate& coor,int Oindex){
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
} }
inline void InOutCoorToLocalCoor (std::vector<int> &ocoor, std::vector<int> &icoor, std::vector<int> &lcoor) { inline void InOutCoorToLocalCoor (Coordinate &ocoor, Coordinate &icoor, Coordinate &lcoor) {
lcoor.resize(_ndimension); lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d]; lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
@ -141,7 +142,7 @@ public:
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// SIMD lane addressing // SIMD lane addressing
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
inline void iCoorFromIindex(std::vector<int> &coor,int lane) inline void iCoorFromIindex(Coordinate &coor,int lane)
{ {
Lexicographic::CoorFromIndex(coor,lane,_simd_layout); Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
} }
@ -152,8 +153,6 @@ public:
inline int PermuteType(int dimension){ inline int PermuteType(int dimension){
int permute_type=0; int permute_type=0;
// //
// FIXME:
//
// Best way to encode this would be to present a mask // Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation // for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just // size. If there is only one simd dimension rotated, this is just
@ -186,11 +185,11 @@ public:
inline int gSites(void) const { return _isites*_osites*_Nprocessors; }; inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;}; inline int Nd (void) const { return _ndimension;};
inline const std::vector<int> LocalStarts(void) { return _lstart; }; inline const Coordinate LocalStarts(void) { return _lstart; };
inline const std::vector<int> &FullDimensions(void) { return _fdimensions;}; inline const Coordinate &FullDimensions(void) { return _fdimensions;};
inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;}; inline const Coordinate &GlobalDimensions(void) { return _gdimensions;};
inline const std::vector<int> &LocalDimensions(void) { return _ldimensions;}; inline const Coordinate &LocalDimensions(void) { return _ldimensions;};
inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;}; inline const Coordinate &VirtualLocalDimensions(void) { return _ldimensions;};
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details // Utility to print the full decomposition details
@ -214,15 +213,15 @@ public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Global addressing // Global addressing
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){ void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
assert(gidx< gSites()); assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
} }
void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){ void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
assert(lidx<lSites()); assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
} }
void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){ void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
gidx=0; gidx=0;
int mult=1; int mult=1;
for(int mu=0;mu<_ndimension;mu++) { for(int mu=0;mu<_ndimension;mu++) {
@ -230,7 +229,7 @@ public:
mult*=_gdimensions[mu]; mult*=_gdimensions[mu];
} }
} }
void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor) void GlobalCoorToProcessorCoorLocalCoor(Coordinate &pcoor,Coordinate &lcoor,const Coordinate &gcoor)
{ {
pcoor.resize(_ndimension); pcoor.resize(_ndimension);
lcoor.resize(_ndimension); lcoor.resize(_ndimension);
@ -240,14 +239,14 @@ public:
lcoor[mu] = gcoor[mu]%_fld; lcoor[mu] = gcoor[mu]%_fld;
} }
} }
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor) void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const Coordinate &gcoor)
{ {
std::vector<int> pcoor; Coordinate pcoor;
std::vector<int> lcoor; Coordinate lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor); rank = RankFromProcessorCoor(pcoor);
/* /*
std::vector<int> cblcoor(lcoor); Coordinate cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){ for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) { if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2; cblcoor[d] = lcoor[d]/2;
@ -258,10 +257,10 @@ public:
o_idx= oIndex(lcoor); o_idx= oIndex(lcoor);
} }
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor) void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , Coordinate &gcoor)
{ {
gcoor.resize(_ndimension); gcoor.resize(_ndimension);
std::vector<int> coor(_ndimension); Coordinate coor(_ndimension);
ProcessorCoorFromRank(rank,coor); ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
@ -273,20 +272,19 @@ public:
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
} }
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor) void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,Coordinate &fcoor)
{ {
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor); RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){ if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb; fcoor[0] = fcoor[0]*2+cb;
} }
} }
void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor) void ProcessorCoorLocalCoorToGlobalCoor(Coordinate &Pcoor,Coordinate &Lcoor,Coordinate &gcoor)
{ {
gcoor.resize(_ndimension); gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu]; for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
} }
}; };
NAMESPACE_END(Grid);
}
#endif #endif

View File

@ -28,13 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CARTESIAN_FULL_H #ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H #define GRID_CARTESIAN_FULL_H
namespace Grid{ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// Grid Support. // Grid Support.
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
class GridCartesian: public GridBase { class GridCartesian: public GridBase {
public: public:
@ -49,7 +48,7 @@ public:
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
return 0; return 0;
} }
virtual int CheckerBoard(const std::vector<int> &site){ virtual int CheckerBoard(const Coordinate &site){
return 0; return 0;
} }
virtual int CheckerBoardDestination(int cb,int shift,int dim){ virtual int CheckerBoardDestination(int cb,int shift,int dim){
@ -64,16 +63,16 @@ public:
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator. // Constructor takes a parent grid and possibly subdivides communicator.
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions, GridCartesian(const Coordinate &dimensions,
const std::vector<int> &simd_layout, const Coordinate &simd_layout,
const std::vector<int> &processor_grid, const Coordinate &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy) const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
} }
GridCartesian(const std::vector<int> &dimensions, GridCartesian(const Coordinate &dimensions,
const std::vector<int> &simd_layout, const Coordinate &simd_layout,
const std::vector<int> &processor_grid, const Coordinate &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank) const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
@ -81,18 +80,18 @@ public:
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
// Construct from comm world // Construct from comm world
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions, GridCartesian(const Coordinate &dimensions,
const std::vector<int> &simd_layout, const Coordinate &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid) const Coordinate &processor_grid) : GridBase(processor_grid)
{ {
Init(dimensions,simd_layout,processor_grid); Init(dimensions,simd_layout,processor_grid);
} }
virtual ~GridCartesian() = default; virtual ~GridCartesian() = default;
void Init(const std::vector<int> &dimensions, void Init(const Coordinate &dimensions,
const std::vector<int> &simd_layout, const Coordinate &simd_layout,
const std::vector<int> &processor_grid) const Coordinate &processor_grid)
{ {
/////////////////////// ///////////////////////
// Grid information // Grid information
@ -170,5 +169,6 @@ public:
}; };
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -29,8 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CARTESIAN_RED_BLACK_H #ifndef GRID_CARTESIAN_RED_BLACK_H
#define GRID_CARTESIAN_RED_BLACK_H #define GRID_CARTESIAN_RED_BLACK_H
NAMESPACE_BEGIN(Grid);
namespace Grid {
static const int CbRed =0; static const int CbRed =0;
static const int CbBlack=1; static const int CbBlack=1;
@ -41,7 +40,7 @@ namespace Grid {
class GridRedBlackCartesian : public GridBase class GridRedBlackCartesian : public GridBase
{ {
public: public:
std::vector<int> _checker_dim_mask; Coordinate _checker_dim_mask;
int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;
@ -49,7 +48,7 @@ public:
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;
} }
virtual int CheckerBoard(const std::vector<int> &site){ virtual int CheckerBoard(const Coordinate &site){
int linear=0; int linear=0;
assert(site.size()==_ndimension); assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
@ -59,7 +58,6 @@ public:
return (linear&0x1); return (linear&0x1);
} }
// Depending on the cb of site, we toggle source cb. // Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e) // for block #b, element #e = (b, e)
// we need // we need
@ -83,7 +81,7 @@ public:
} }
virtual int CheckerBoardFromOindex (int Oindex) virtual int CheckerBoardFromOindex (int Oindex)
{ {
std::vector<int> ocoor; Coordinate ocoor;
oCoorFromOindex(ocoor,Oindex); oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor); return CheckerBoard(ocoor);
} }
@ -118,7 +116,7 @@ public:
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base) GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{ {
int dims = base->_ndimension; int dims = base->_ndimension;
std::vector<int> checker_dim_mask(dims,1); Coordinate checker_dim_mask(dims,1);
int checker_dim = 0; int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim); Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
}; };
@ -127,7 +125,7 @@ public:
// Create redblack from original grid, with non-trivial checker dim mask // Create redblack from original grid, with non-trivial checker dim mask
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base, GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &checker_dim_mask, const Coordinate &checker_dim_mask,
int checker_dim int checker_dim
) : GridBase(base->_processors,*base) ) : GridBase(base->_processors,*base)
{ {
@ -135,40 +133,11 @@ public:
} }
virtual ~GridRedBlackCartesian() = default; virtual ~GridRedBlackCartesian() = default;
#if 0
////////////////////////////////////////////////////////////
// Create redblack grid ;; deprecate these. Should not
// need direct creation of redblack without a full grid to base on
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim
) : GridBase(processor_grid,*base)
{
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
//////////////////////////////////////////////////////////// void Init(const Coordinate &dimensions,
// Create redblack grid const Coordinate &simd_layout,
//////////////////////////////////////////////////////////// const Coordinate &processor_grid,
GridRedBlackCartesian(const GridBase *base, const Coordinate &checker_dim_mask,
const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid,*base)
{
std::vector<int> checker_dim_mask(dimensions.size(),1);
int checker_dim = 0;
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
}
#endif
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim) int checker_dim)
{ {
@ -282,7 +251,7 @@ public:
}; };
protected: protected:
virtual int oIndex(std::vector<int> &coor) virtual int oIndex(Coordinate &coor)
{ {
int idx = 0; int idx = 0;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
@ -299,7 +268,7 @@ public:
return idx; return idx;
}; };
virtual int iIndex(std::vector<int> &lcoor) virtual int iIndex(Coordinate &lcoor)
{ {
int idx = 0; int idx = 0;
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
@ -316,5 +285,5 @@ public:
return idx; return idx;
} }
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_COMMUNICATOR_H #ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H #define GRID_COMMUNICATOR_H
#include <Grid/util/Coordinate.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h> #include <Grid/communicator/Communicator_base.h>

View File

@ -31,7 +31,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <limits.h> #include <limits.h>
#include <sys/mman.h> #include <sys/mman.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
@ -47,8 +47,8 @@ int CartesianCommunicator::Dimensions(void) { return
int CartesianCommunicator::IsBoss(void) { return _processor==0; }; int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; }; int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; }; int CartesianCommunicator::ThisRank(void) { return _processor; };
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; const Coordinate & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; }; const Coordinate & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; }; int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -72,5 +72,6 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);
} }
} NAMESPACE_END(Grid);

View File

@ -34,7 +34,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
class CartesianCommunicator : public SharedMemory { class CartesianCommunicator : public SharedMemory {
@ -52,9 +52,9 @@ public:
// Communicator should know nothing of the physics grid, only processor grid. // Communicator should know nothing of the physics grid, only processor grid.
//////////////////////////////////////////// ////////////////////////////////////////////
int _Nprocessors; // How many in all int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes. Coordinate _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate Coordinate _processor_coor; // linear processor coordinate
unsigned long _ndimension; unsigned long _ndimension;
static Grid_MPI_Comm communicator_world; static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator; Grid_MPI_Comm communicator;
@ -69,8 +69,8 @@ public:
// Constructors to sub-divide a parent communicator // Constructors to sub-divide a parent communicator
// and default to comm world // and default to comm world
//////////////////////////////////////////////// ////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank); CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const std::vector<int> &pdimensions_in); CartesianCommunicator(const Coordinate &pdimensions_in);
virtual ~CartesianCommunicator(); virtual ~CartesianCommunicator();
private: private:
@ -79,7 +79,7 @@ public:
// Private initialise from an MPI communicator // Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private // Can use after an MPI_Comm_split, but hidden from user so private
//////////////////////////////////////////////// ////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base); void InitFromMPICommunicator(const Coordinate &processors, Grid_MPI_Comm communicator_base);
public: public:
@ -88,15 +88,15 @@ public:
// Wraps MPI_Cart routines, or implements equivalent on other impls // Wraps MPI_Cart routines, or implements equivalent on other impls
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest); void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor); int RankFromProcessorCoor(Coordinate &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor); void ProcessorCoorFromRank(int rank,Coordinate &coor);
int Dimensions(void) ; int Dimensions(void) ;
int IsBoss(void) ; int IsBoss(void) ;
int BossRank(void) ; int BossRank(void) ;
int ThisRank(void) ; int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ; const Coordinate & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ; const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ; int ProcessorCount(void) ;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -199,9 +199,10 @@ public:
template<class obj> void Broadcast(int root,obj &data) template<class obj> void Broadcast(int root,obj &data)
{ {
Broadcast(root,(void *)&data,sizeof(data)); Broadcast(root,(void *)&data,sizeof(data));
};
};
} }
};
NAMESPACE_END(Grid);
#endif #endif

View File

@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
@ -44,12 +44,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) || if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
assert(0); assert(0);
} }
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
assert(0);
}
}
// Never clean up as done once. // Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
@ -69,14 +74,14 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0); assert(ierr==0);
} }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{ {
int rank; int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank); int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0); assert(ierr==0);
return rank; return rank;
} }
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{ {
coor.resize(_ndimension); coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]); int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
@ -86,7 +91,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world // Initialises from communicator_world
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{ {
MPI_Comm optimal_comm; MPI_Comm optimal_comm;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
@ -105,12 +110,12 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
////////////////////////////////// //////////////////////////////////
// Try to subdivide communicator // Try to subdivide communicator
////////////////////////////////// //////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{ {
_ndimension = processors.size(); assert(_ndimension>=1); _ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension); int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0); Coordinate parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1); Coordinate parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc... // Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension; int pad = _ndimension-parent_ndimension;
@ -133,9 +138,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
int Nchild = Nparent/childsize; int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent); assert (childsize * Nchild == Nparent);
std::vector<int> ccoor(_ndimension); // coor within subcommunicator Coordinate ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent Coordinate scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent Coordinate ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d]; ccoor[d] = parent_processor_coor[d] % processors[d];
@ -152,36 +157,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
MPI_Comm comm_split; MPI_Comm comm_split;
if ( Nchild > 1 ) { if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Split the communicator // Split the communicator
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@ -220,7 +195,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
} }
} }
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base) void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base)
{ {
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo // Creates communicator, and the communicator_halo
@ -237,7 +212,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &proc
_Nprocessors*=_processors[i]; _Nprocessors*=_processors[i];
} }
std::vector<int> periodic(_ndimension,1); Coordinate periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator); MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor); MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@ -474,7 +449,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
std::vector<int> row(_ndimension,1); Coordinate row(_ndimension,1);
assert(dim>=0 && dim<_ndimension); assert(dim>=0 && dim<_ndimension);
// Split the communicator // Split the communicator
@ -503,7 +478,6 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
MPI_Type_free(&object); MPI_Type_free(&object);
} }
NAMESPACE_END(Grid);
}

View File

@ -27,7 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
@ -42,14 +42,14 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
GlobalSharedMemory::Hugepages); GlobalSharedMemory::Hugepages);
} }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors) : CartesianCommunicator(processors)
{ {
srank=0; srank=0;
SetCommunicator(communicator_world); SetCommunicator(communicator_world);
} }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{ {
_processors = processors; _processors = processors;
_ndimension = processors.size(); assert(_ndimension>=1); _ndimension = processors.size(); assert(_ndimension>=1);
@ -122,8 +122,8 @@ int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;} int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor; } void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
source =0; source =0;
@ -160,6 +160,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};
NAMESPACE_END(Grid);
}

View File

@ -28,10 +28,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
// static data // static data
int GlobalSharedMemory::HPEhypercube = 1;
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL; uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0; int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup; int GlobalSharedMemory::_ShmSetup;
@ -76,6 +77,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl; std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
assert(heap_bytes<heap_size); assert(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr; return ptr;
} }
void SharedMemory::ShmBufferFreeAll(void) { void SharedMemory::ShmBufferFreeAll(void) {
@ -84,9 +86,9 @@ void SharedMemory::ShmBufferFreeAll(void) {
} }
void *SharedMemory::ShmBufferSelf(void) void *SharedMemory::ShmBufferSelf(void)
{ {
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank]; return ShmCommBufs[ShmRank];
} }
NAMESPACE_END(Grid);
}

View File

@ -25,18 +25,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
// b) shared memory buffers container
// -- static globally shared; init once
// -- per instance set of buffers.
//
#pragma once #pragma once
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
@ -57,7 +45,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <numaif.h> #include <numaif.h>
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
@ -71,12 +59,18 @@ class GlobalSharedMemory {
private: private:
static const int MAXLOG2RANKSPERNODE = 16; static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation // Init once lock on the buffer allocation
static int _ShmSetup; static int _ShmSetup;
static int _ShmAlloc; static int _ShmAlloc;
static uint64_t _ShmAllocBytes; static uint64_t _ShmAllocBytes;
public: public:
///////////////////////////////////////
// HPE 8600 hypercube optimisation
///////////////////////////////////////
static int HPEhypercube;
static int ShmSetup(void) { return _ShmSetup; } static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; } static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; } static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
@ -102,14 +96,16 @@ class GlobalSharedMemory {
// Create an optimal reordered communicator that makes MPI_Cart_create get it right // Create an optimal reordered communicator that makes MPI_Cart_create get it right
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// Provide shared memory facilities off comm world // Provide shared memory facilities off comm world
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
}; };
@ -150,6 +146,7 @@ class SharedMemory
// Call on any instance // Call on any instance
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
void SharedMemoryTest(void); void SharedMemoryTest(void);
void *ShmBufferSelf(void); void *ShmBufferSelf(void);
void *ShmBuffer (int rank); void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p); void *ShmBufferTranslate(int rank,void * local_p);
@ -164,4 +161,5 @@ class SharedMemory
}; };
} NAMESPACE_END(Grid);

View File

@ -29,8 +29,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
namespace Grid { #ifdef GRID_NVCC
#include <cuda_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
@ -46,6 +50,11 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize); MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank // WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes // WorldNodes
@ -130,7 +139,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
} }
return log2size; return log2size;
} }
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions // Look and see if it looks like an HPE 8600 based on hostname conventions
@ -143,10 +152,10 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
gethostname(name,namelen); gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3) OptimalCommunicatorHypercube(processors,optimal_comm); if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm); else OptimalCommunicatorSharedMemory(processors,optimal_comm);
} }
void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Assert power of two shm_size. // Assert power of two shm_size.
@ -188,9 +197,9 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
} }
std::string hname(name); std::string hname(name);
std::cout << "hostname "<<hname<<std::endl; // std::cout << "hostname "<<hname<<std::endl;
std::cout << "R " << R << " I " << I << " N "<< N // std::cout << "R " << R << " I " << I << " N "<< N
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl; // << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition. // broadcast node 0's base coordinate for this partition.
@ -213,7 +222,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
std::vector<int> processor_coor(ndimension); std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension); std::vector<int> WorldDims = processors.toVector();
std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension); std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension); std::vector<int> HyperCoor(ndimension);
int dim = 0; int dim = 0;
@ -269,7 +279,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const std::vector<int> &pr
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0); assert(ierr==0);
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Assert power of two shm_size. // Assert power of two shm_size.
@ -282,9 +292,9 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int>
// in a maximally symmetrical way // in a maximally symmetrical way
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ndimension = processors.size(); int ndimension = processors.size();
std::vector<int> processor_coor(ndimension); Coordinate processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension); Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1); Coordinate NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension); Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
int dim = 0; int dim = 0;
for(int l2=0;l2<log2size;l2++){ for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension; while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
@ -330,7 +340,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const std::vector<int>
#ifdef GRID_MPI3_SHMGET #ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
@ -389,10 +399,97 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// GPU (GRID_NVCC) implementation of the node-level communication buffer allocation.
//
// Every rank of WorldShmComm
//   1) selects the CUDA device whose index equals its WorldShmRank,
//   2) allocates and zeroes `bytes` of device memory with cudaMalloc,
//   3) exchanges CUDA IPC handles over WorldShmComm so that WorldShmCommBufs[r]
//      addresses the buffer owned by rank r (our own pointer for r==WorldShmRank).
//
// bytes : size of the buffer each rank allocates
// flags : not used by this implementation
//
// Requires _ShmSetup==1 and _ShmAlloc==0 (asserted). On return _ShmAlloc==1 and
// _ShmAllocBytes==bytes. A failing cudaMalloc or CUDA IPC call prints a diagnostic
// to std::cerr and terminates with exit(EXIT_FAILURE).
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  void * ShmCommBuf = nullptr;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);

  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the pointer array for shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);

  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // TODO/FIXME : NOT ALL NVLINK BOARDS have full Peer to peer connectivity.
  // The annoyance is that they have partial peer 2 peer. This occurs on the 8 GPU blades.
  // e.g. DGX1, supermicro board,
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  //  cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);

  // NOTE(review): the return code of cudaSetDevice is not checked here; presumably one
  // rank per visible GPU is expected on each node -- confirm against the launch setup.
  cudaSetDevice(WorldShmRank);

  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Each MPI rank should allocate our own buffer
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  auto err = cudaMalloc(&ShmCommBuf, bytes);
  if ( err != cudaSuccess) {
    // The call that failed is cudaMalloc; report it under its real name
    std::cerr << " SharedMemoryMPI.cc cudaMalloc failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);
  }
  if (ShmCommBuf == nullptr ) {
    std::cerr << " SharedMemoryMPI.cc cudaMalloc failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);
  }
  if ( WorldRank == 0 ){
    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);

  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  for(int r=0;r<WorldShmSize;r++){

    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
    //////////////////////////////////////////////////
    cudaIpcMemHandle_t handle; // filled locally on rank r, by the broadcast below elsewhere

    if ( r==WorldShmRank ) {
      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
      if ( err != cudaSuccess) {
        std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
        exit(EXIT_FAILURE);
      }
    }

    //////////////////////////////////////////////////
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
    {
      int ierr=MPI_Bcast(&handle,
                         sizeof(handle),
                         MPI_BYTE,
                         r,
                         WorldShmComm);
      assert(ierr==0);
    }

    ///////////////////////////////////////////////////////////////
    // If I am not the source, overwrite thisBuf with remote buffer
    ///////////////////////////////////////////////////////////////
    void * thisBuf = ShmCommBuf;
    if ( r!=WorldShmRank ) {
      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
      if ( err != cudaSuccess) {
        std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
        exit(EXIT_FAILURE);
      }
    }

    ///////////////////////////////////////////////////////////////
    // Save a copy of the device buffers
    ///////////////////////////////////////////////////////////////
    WorldShmCommBufs[r] = thisBuf;
  }

  _ShmAllocBytes=bytes;
  _ShmAlloc=1;
}
#else
#ifdef GRID_MPI3_SHMMMAP #ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -429,7 +526,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; // std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
} }
_ShmAlloc=1; _ShmAlloc=1;
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
@ -439,7 +536,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE #ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -486,7 +583,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1); assert(_ShmSetup==1);
assert(_ShmAlloc==0); assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm); MPI_Barrier(WorldShmComm);
@ -552,10 +649,27 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
} }
#endif #endif
#endif // End NVCC case for GPU device buffers
/////////////////////////////////////////////////////////////////////////
// Routines accessing shared memory should route through these helpers for GPU safety
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
// Set `bytes` bytes starting at `dest` to zero.
// Under GRID_NVCC this goes through cudaMemset; a failure is reported on
// std::cerr and the program exits, as the GPU allocator in this file does.
// Otherwise the host routine bzero is used.
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
#ifdef GRID_NVCC
  auto err = cudaMemset(dest,0,bytes);
  if ( err != cudaSuccess ) {
    // Do not carry on with a buffer that may not have been cleared
    std::cerr << " SharedMemoryMPI.cc cudaMemset failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);
  }
#else
  bzero(dest,bytes);
#endif
}
/////////////////////////////////////////////////////////////////////////
// Copy `bytes` bytes from `src` to `dest`.
// Under GRID_NVCC this goes through cudaMemcpy with cudaMemcpyDefault, so
// the transfer direction is left to the CUDA runtime; a failure is reported
// on std::cerr and the program exits, as the GPU allocator in this file does.
// Otherwise the host routine bcopy is used (note its source-first argument order).
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{
#ifdef GRID_NVCC
  auto err = cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
  if ( err != cudaSuccess ) {
    // A silently failed copy would leave stale data in dest
    std::cerr << " SharedMemoryMPI.cc cudaMemcpy failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
    exit(EXIT_FAILURE);
  }
#else
  bcopy(src,dest,bytes);
#endif
}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
@ -587,7 +701,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@ -600,6 +713,8 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r; std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
SharedMemoryTest();
} }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// On node barrier // On node barrier
@ -614,25 +729,27 @@ void SharedMemory::ShmBarrier(void)
void SharedMemory::SharedMemoryTest(void) void SharedMemory::SharedMemoryTest(void)
{ {
ShmBarrier(); ShmBarrier();
uint64_t check[3];
uint64_t magic = 0x5A5A5A;
if ( ShmRank == 0 ) { if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0]=GlobalSharedMemory::WorldNode; check[0]=GlobalSharedMemory::WorldNode;
check[1]=r; check[1]=r;
check[2] = 0x5A5A5A; check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(int r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r]; ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode); assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r); assert(check[1]==r);
assert(check[2]==0x5A5A5A); assert(check[2]==magic);
}
ShmBarrier(); ShmBarrier();
} }
}
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)
{ {
@ -645,7 +762,6 @@ void *SharedMemory::ShmBuffer(int rank)
} }
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{ {
static int count =0;
int gpeer = ShmRanks[rank]; int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){ if (gpeer == MPI_UNDEFINED){
@ -664,4 +780,5 @@ SharedMemory::~SharedMemory()
} }
}; };
} NAMESPACE_END(Grid);

View File

@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@ -47,7 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
_ShmSetup=1; _ShmSetup=1;
} }
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
optimal_comm = WorldComm; optimal_comm = WorldComm;
} }
@ -125,4 +125,5 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
SharedMemory::~SharedMemory() SharedMemory::~SharedMemory()
{}; {};
} NAMESPACE_END(Grid);

View File

@ -25,10 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef _GRID_CSHIFT_COMMON_H_ #pragma once
#define _GRID_CSHIFT_COMMON_H_
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
@ -36,20 +35,21 @@ namespace Grid {
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask = 0x3; cbmask = 0x3;
} }
int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so=plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0; int ent = 0;
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static Vector<std::pair<int,int> > table; table.resize(e1*e2);
int stride=rhs.Grid()->_slice_stride[dimension];
int stride=rhs._grid->_slice_stride[dimension]; auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@ -63,66 +63,68 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b); table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
} }
} }
} }
} }
parallel_for(int i=0;i<ent;i++){ thread_for(i,ent,{
buffer[table[i].first]=rhs._odata[table[i].second]; buffer[table[i].first]=rhs_v[table[i].second];
} });
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split // Gather for when there *is* need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) Gather_plane_extract(const Lattice<vobj> &rhs,
ExtractPointerArray<typename vobj::scalar_object> pointers,
int dimension,int plane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask = 0x3; cbmask = 0x3;
} }
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){ if ( cbmask ==0x3){
parallel_for_nest2(int n=0;n<e1;n++){ thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*n1; int o = n*n1;
int offset = b+n*e2; int offset = b+n*e2;
vobj temp =rhs._odata[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}
} }
});
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
std::cout << " Dense packed buffer WARNING " <<std::endl; std::cout << " Dense packed buffer WARNING " <<std::endl;
parallel_for_nest2(int n=0;n<e1;n++){ thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o=n*n1; int o=n*n1;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
int offset = b+n*e2; int offset = b+n*e2;
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
vobj temp =rhs._odata[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
} }
} });
} }
} }
@ -131,17 +133,17 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent =0; int ent =0;
@ -150,8 +152,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b); table[ent++] = std::pair<int,int>(so+o+b,bo+b);
} }
} }
@ -160,8 +162,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int bo=0; int bo=0;
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++); table[ent++]=std::pair<int,int> (so+o+b,bo++);
} }
@ -169,48 +171,51 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
} }
} }
parallel_for(int i=0;i<ent;i++){ auto rhs_v = rhs.View();
rhs._odata[table[i].first]=buffer[table[i].second]; thread_for(i,ent,{
} rhs_v[table[i].first]=buffer[table[i].second];
});
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split // Scatter for when there *is* need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<typename vobj::scalar_object> pointers,int dimension,int plane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){ auto rhs_v = rhs.View();
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}
} }
});
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View();
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
} }
} }
} }
@ -222,18 +227,18 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask) template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0; int ent=0;
@ -248,7 +253,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride+b; int o =n*stride+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o); table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
@ -256,32 +261,33 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
} }
} }
parallel_for(int i=0;i<ent;i++){ auto rhs_v = rhs.View();
lhs._odata[table[i].first]=rhs._odata[table[i].second]; auto lhs_v = lhs.View();
} thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
} }
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3; cbmask=0x3;
} }
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block [dimension]; int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0; int ent=0;
double t_tab,t_perm;
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@ -292,14 +298,16 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} }
parallel_for(int i=0;i<ent;i++){ auto rhs_v = rhs.View();
permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type); auto lhs_v = lhs.View();
} thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
@ -309,10 +317,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
double t_local;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3); Cshift_local(ret,rhs,dimension,shift,0x3);
@ -324,7 +330,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
GridBase *grid = rhs._grid; GridBase *grid = rhs.Grid();
int fd = grid->_fdimensions[dimension]; int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension]; int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension]; int ld = grid->_ldimensions[dimension];
@ -335,18 +341,18 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
shift = (shift+fd)%fd; shift = (shift+fd)%fd;
// the permute type // the permute type
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
int permute_dim =grid->PermuteDim(dimension); int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension); int permute_type=grid->PermuteType(dimension);
int permute_type_dist; int permute_type_dist;
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
int o = 0; // int o = 0;
int bo = x * grid->_ostride[dimension]; int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
int sx = (x+sshift)%rd; int sx = (x+sshift)%rd;
// wrap is whether sshift > rd. // wrap is whether sshift > rd.
@ -387,5 +393,5 @@ template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &r
} }
} }
} NAMESPACE_END(Grid);
#endif

View File

@ -30,27 +30,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define _GRID_CSHIFT_MPI_H_ #define _GRID_CSHIFT_MPI_H_
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs._grid); Lattice<vobj> ret(rhs.Grid());
int fd = rhs._grid->_fdimensions[dimension]; int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension. // Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd; shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
// the permute type // the permute type
int simd_layout = rhs._grid->_simd_layout[dimension]; int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ; int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim); int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
if ( !comm_dim ) { if ( !comm_dim ) {
@ -70,10 +70,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; // std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl; // std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3); Cshift_comms(ret,rhs,dimension,shift,0x3);
@ -88,8 +88,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
{ {
int sshift[2]; int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
@ -107,25 +107,25 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs._grid; GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs._grid); Lattice<vobj> temp(rhs.Grid());
int fd = rhs._grid->_fdimensions[dimension]; int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs._grid->_processors[dimension]; int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension]; int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ; int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1); assert(simd_layout==1);
assert(comm_dim==1); assert(comm_dim==1);
assert(shift>=0); assert(shift>=0);
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size); commVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size); commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
@ -145,7 +145,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
int rank = grid->_processor; // int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
@ -165,7 +165,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
GridBase *grid=rhs._grid; GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
@ -193,21 +193,21 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// Simd direction uses an extract/merge pair // Simd direction uses an extract/merge pair
/////////////////////////////////////////////// ///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); // ExtractPointerArray<scalar_object> pointers(Nsimd); //
std::vector<scalar_object *> rpointers(Nsimd); // received pointers ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
/////////////////////////////////////////// ///////////////////////////////////////////
// Work out what to send where // Work out what to send where
/////////////////////////////////////////// ///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even; int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim // loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){ for(int x=0;x<rd;x++){
@ -258,5 +258,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
} }
} }
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -27,13 +27,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#ifndef _GRID_CSHIFT_NONE_H_ #ifndef _GRID_CSHIFT_NONE_H_
#define _GRID_CSHIFT_NONE_H_ #define _GRID_CSHIFT_NONE_H_
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
Lattice<vobj> ret(rhs._grid); Lattice<vobj> ret(rhs.Grid());
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
Cshift_local(ret,rhs,dimension,shift); Cshift_local(ret,rhs,dimension,shift);
return ret; return ret;
} }
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,3 +1,4 @@
#ifndef __NVCC__
/* /*
__ _____ _____ _____ __ _____ _____ _____
__| | __| | | | JSON for Modern C++ __| | __| | | | JSON for Modern C++
@ -18918,3 +18919,4 @@ inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std
#endif #endif
#endif

View File

@ -25,9 +25,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_H #pragma once
#define GRID_LATTICE_H
#include <Grid/lattice/Lattice_base.h> #include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h>
#include <Grid/lattice/Lattice_arith.h>
#include <Grid/lattice/Lattice_trace.h>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h>
//#include <Grid/lattice/Lattice_where.h>
#include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#endif

View File

@ -36,13 +36,13 @@ directory
#include <typeinfo> #include <typeinfo>
#include <vector> #include <vector>
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Predicated where support // Predicated where support
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template <class iobj, class vobj, class robj> template <class iobj, class vobj, class robj>
inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
const robj &iffalse) { const robj &iffalse) {
typename std::remove_const<vobj>::type ret; typename std::remove_const<vobj>::type ret;
@ -51,11 +51,10 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd(); const int Nsimd = vobj::vector_type::Nsimd();
const int words = sizeof(vobj) / sizeof(vector_type);
std::vector<Integer> mask(Nsimd); ExtractBuffer<Integer> mask(Nsimd);
std::vector<scalar_object> truevals(Nsimd); ExtractBuffer<scalar_object> truevals(Nsimd);
std::vector<scalar_object> falsevals(Nsimd); ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals); extract(iftrue, truevals);
extract(iffalse, falsevals); extract(iffalse, falsevals);
@ -69,149 +68,139 @@ inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
return ret; return ret;
} }
//////////////////////////////////////////// /////////////////////////////////////////////////////
// recursive evaluation of expressions; Could
// switch to generic approach with variadics, a la
// Antonin's Lat Sim but the repack to variadic with popped
// from tuple is hideous; C++14 introduces std::make_index_sequence for this
////////////////////////////////////////////
// leaf eval of lattice ; should enable if protect using traits
template <typename T>
using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T>
using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
//Specialization of getVectorType for lattices //Specialization of getVectorType for lattices
/////////////////////////////////////////////////////
template<typename T> template<typename T>
struct getVectorType<Lattice<T> >{ struct getVectorType<Lattice<T> >{
typedef typename Lattice<T>::vector_object type; typedef typename Lattice<T>::vector_object type;
}; };
template<class sobj> ////////////////////////////////////////////
inline sobj eval(const unsigned int ss, const sobj &arg) //-- recursive evaluation of expressions; --
// handle leaves of syntax tree
///////////////////////////////////////////////////
template<class sobj> accelerator_inline
sobj eval(const uint64_t ss, const sobj &arg)
{ {
return arg; return arg;
} }
template <class lobj>
inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) { template <class lobj> accelerator_inline
return arg._odata[ss]; const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
{
return arg[ss];
}
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
auto view = arg.View();
return view[ss];
} }
// handle nodes in syntax tree ///////////////////////////////////////////////////
template <typename Op, typename T1> // handle nodes in syntax tree- eval one operand
auto inline eval( ///////////////////////////////////////////////////
const unsigned int ss, template <typename Op, typename T1> accelerator_inline
const LatticeUnaryExpression<Op, T1> &expr) // eval one operand auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) { -> decltype(expr.op.func( eval(ss, expr.arg1)))
return expr.first.func(eval(ss, std::get<0>(expr.second))); {
return expr.op.func( eval(ss, expr.arg1) );
} }
///////////////////////
template <typename Op, typename T1, typename T2> // eval two operands
auto inline eval( ///////////////////////
const unsigned int ss, template <typename Op, typename T1, typename T2> accelerator_inline
const LatticeBinaryExpression<Op, T1, T2> &expr) // eval two operands auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)), -> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
eval(ss, std::get<1>(expr.second)))) { {
return expr.first.func(eval(ss, std::get<0>(expr.second)), return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
eval(ss, std::get<1>(expr.second)));
} }
///////////////////////
template <typename Op, typename T1, typename T2, typename T3> // eval three operands
auto inline eval(const unsigned int ss, ///////////////////////
const LatticeTrinaryExpression<Op, T1, T2, T3> template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
&expr) // eval three operands auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)), -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
eval(ss, std::get<1>(expr.second)), {
eval(ss, std::get<2>(expr.second)))) { return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)));
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Obtain the grid from an expression, ensuring conformable. This must follow a // Obtain the grid from an expression, ensuring conformable. This must follow a
// tree recursion // tree recursion; must retain grid pointer in the LatticeView class which sucks
// Use a different method, and make it void *.
// Perhaps a conformable method.
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <class T1, template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr> accelerator_inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
{ {
if (grid) { lat.Conformable(grid);
conformable(grid, lat._grid);
} }
grid = lat._grid;
} template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
template <class T1, accelerator_inline
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> void GridFromExpression(GridBase *&grid,const T1 &notlat) // non-lattice leaf
inline void GridFromExpression(GridBase *&grid,
const T1 &notlat) // non-lattice leaf
{} {}
template <typename Op, typename T1> template <typename Op, typename T1>
inline void GridFromExpression(GridBase *&grid, accelerator_inline
const LatticeUnaryExpression<Op, T1> &expr) { void GridFromExpression(GridBase *&grid,const LatticeUnaryExpression<Op, T1> &expr)
GridFromExpression(grid, std::get<0>(expr.second)); // recurse {
GridFromExpression(grid, expr.arg1); // recurse
} }
template <typename Op, typename T1, typename T2> template <typename Op, typename T1, typename T2>
inline void GridFromExpression( accelerator_inline
GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) { void GridFromExpression(GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr)
GridFromExpression(grid, std::get<0>(expr.second)); // recurse {
GridFromExpression(grid, std::get<1>(expr.second)); GridFromExpression(grid, expr.arg1); // recurse
GridFromExpression(grid, expr.arg2);
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
inline void GridFromExpression( accelerator_inline
GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) { void GridFromExpression(GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
GridFromExpression(grid, std::get<0>(expr.second)); // recurse {
GridFromExpression(grid, std::get<1>(expr.second)); GridFromExpression(grid, expr.arg1); // recurse
GridFromExpression(grid, std::get<2>(expr.second)); GridFromExpression(grid, expr.arg2); // recurse
GridFromExpression(grid, expr.arg3); // recurse
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Obtain the CB from an expression, ensuring conformable. This must follow a // Obtain the CB from an expression, ensuring conformable. This must follow a
// tree recursion // tree recursion
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template <class T1, template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{ {
if ((cb == Odd) || (cb == Even)) { if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.checkerboard); assert(cb == lat.Checkerboard());
} }
cb = lat.checkerboard; cb = lat.Checkerboard();
// std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
} }
template <class T1, template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{ {
// std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
}
template <typename Op, typename T1>
inline void CBFromExpression(int &cb,
const LatticeUnaryExpression<Op, T1> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse
// std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
} }
template <typename Op, typename T1, typename T2> template <typename Op, typename T1> inline
inline void CBFromExpression(int &cb, void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
const LatticeBinaryExpression<Op, T1, T2> &expr) { {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, std::get<1>(expr.second)); }
// std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, expr.arg2); // recurse AST
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
inline void CBFromExpression( inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) { {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, std::get<1>(expr.second)); CBFromExpression(cb, expr.arg2); // recurse AST
CBFromExpression(cb, std::get<2>(expr.second)); CBFromExpression(cb, expr.arg3); // recurse AST
// std::cout<<GridLogMessage<<"Trinary node cb "<<cb<<std::endl;
} }
//////////////////////////////////////////// ////////////////////////////////////////////
@ -220,7 +209,7 @@ inline void CBFromExpression(
#define GridUnopClass(name, ret) \ #define GridUnopClass(name, ret) \
template <class arg> \ template <class arg> \
struct name { \ struct name { \
static auto inline func(const arg a) -> decltype(ret) { return ret; } \ static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
}; };
GridUnopClass(UnarySub, -a); GridUnopClass(UnarySub, -a);
@ -253,16 +242,18 @@ GridUnopClass(UnaryExp, exp(a));
#define GridBinOpClass(name, combination) \ #define GridBinOpClass(name, combination) \
template <class left, class right> \ template <class left, class right> \
struct name { \ struct name { \
static auto inline func(const left &lhs, const right &rhs) \ static auto accelerator_inline \
-> decltype(combination) const { \ func(const left &lhs, const right &rhs) \
-> decltype(combination) const \
{ \
return combination; \ return combination; \
} \ } \
} };
GridBinOpClass(BinaryAdd, lhs + rhs); GridBinOpClass(BinaryAdd, lhs + rhs);
GridBinOpClass(BinarySub, lhs - rhs); GridBinOpClass(BinarySub, lhs - rhs);
GridBinOpClass(BinaryMul, lhs *rhs); GridBinOpClass(BinaryMul, lhs *rhs);
GridBinOpClass(BinaryDiv, lhs /rhs); GridBinOpClass(BinaryDiv, lhs /rhs);
GridBinOpClass(BinaryAnd, lhs &rhs); GridBinOpClass(BinaryAnd, lhs &rhs);
GridBinOpClass(BinaryOr, lhs | rhs); GridBinOpClass(BinaryOr, lhs | rhs);
GridBinOpClass(BinaryAndAnd, lhs &&rhs); GridBinOpClass(BinaryAndAnd, lhs &&rhs);
@ -274,17 +265,18 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
#define GridTrinOpClass(name, combination) \ #define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \ template <class predicate, class left, class right> \
struct name { \ struct name { \
static auto inline func(const predicate &pred, const left &lhs, \ static auto accelerator_inline \
const right &rhs) -> decltype(combination) const { \ func(const predicate &pred, const left &lhs, const right &rhs) \
-> decltype(combination) const \
{ \
return combination; \ return combination; \
} \ } \
} };
GridTrinOpClass( GridTrinOpClass(TrinaryWhere,
TrinaryWhere, (predicatedWhere<predicate,
(predicatedWhere<predicate, typename std::remove_reference<left>::type, typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs, typename std::remove_reference<right>::type>(pred, lhs,rhs)));
rhs)));
//////////////////////////////////////////// ////////////////////////////////////////////
// Operator syntactical glue // Operator syntactical glue
@ -292,50 +284,32 @@ GridTrinOpClass(
#define GRID_UNOP(name) name<decltype(eval(0, arg))> #define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))> #define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) \ #define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \ #define GRID_DEF_UNOP(op, name) \
template <typename T1, \ template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
typename std::enable_if<is_lattice<T1>::value || \ inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
is_lattice_expr<T1>::value, \ { \
T1>::type * = nullptr> \ return LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
inline auto op(const T1 &arg) \
->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg))); \
} }
#define GRID_BINOP_LEFT(op, name) \ #define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \ template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value || \ typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \ inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \ ->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs)) \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \ { \
std::make_pair(GRID_BINOP(name)(), \ return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs);\
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
} }
#define GRID_BINOP_RIGHT(op, name) \ #define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \ template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value && \ typename std::enable_if<!is_lattice<T1>::value&&!is_lattice_expr<T1>::value,T1>::type * = nullptr, \
!is_lattice_expr<T1>::value, \ typename std::enable_if< is_lattice<T2>::value|| is_lattice_expr<T2>::value,T2>::type * = nullptr> \
T1>::type * = nullptr, \
typename std::enable_if<is_lattice<T2>::value || \
is_lattice_expr<T2>::value, \
T2>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \ inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype( \ ->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs)) \
LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \ { \
std::make_pair(GRID_BINOP(name)(), \ return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs); \
std::forward_as_tuple(lhs, rhs)))) { \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
} }
#define GRID_DEF_BINOP(op, name) \ #define GRID_DEF_BINOP(op, name) \
@ -345,18 +319,14 @@ GridTrinOpClass(
#define GRID_DEF_TRINOP(op, name) \ #define GRID_DEF_TRINOP(op, name) \
template <typename T1, typename T2, typename T3> \ template <typename T1, typename T2, typename T3> \
inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \ inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
->decltype( \ ->decltype(LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs)) \
LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \ { \
const T3 &>(std::make_pair( \ return LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs); \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) { \
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs))); \
} }
//////////////////////// ////////////////////////
// Operator definitions // Operator definitions
//////////////////////// ////////////////////////
GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot);
@ -400,29 +370,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Op, class T1> template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr) auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> { -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret( {
expr); Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2> template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
eval(0, std::get<1>(expr.second))))> { {
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
eval(0, std::get<1>(expr.second))))>
ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2, class T3> template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, std::get<1>(expr.second)), eval(0, expr.arg2),
eval(0, std::get<2>(expr.second))))> { eval(0, expr.arg3)))>
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), {
eval(0, std::get<1>(expr.second)), Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, std::get<2>(expr.second))))> eval(0, expr.arg2),
ret(expr); eval(0, expr.arg3)))> ret(expr);
return ret; return ret;
} }
@ -433,34 +401,7 @@ auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
#undef GRID_DEF_UNOP #undef GRID_DEF_UNOP
#undef GRID_DEF_BINOP #undef GRID_DEF_BINOP
#undef GRID_DEF_TRINOP #undef GRID_DEF_TRINOP
}
#if 0 NAMESPACE_END(Grid);
using namespace Grid;
int main(int argc,char **argv){
Lattice<double> v1(16);
Lattice<double> v2(16);
Lattice<double> v3(16);
BinaryAdd<double,double> tmp;
LatticeBinaryExpression<BinaryAdd<double,double>,Lattice<double> &,Lattice<double> &>
expr(std::make_pair(tmp,
std::forward_as_tuple(v1,v2)));
tmp.func(eval(0,v1),eval(0,v2));
auto var = v1+v2;
std::cout<<GridLogMessage<<typeid(var).name()<<std::endl;
v3=v1+v2;
v3=v1+v2+v1*v2;
};
void testit(Lattice<double> &v1,Lattice<double> &v2,Lattice<double> &v3)
{
v3=v1+v2+v1*v2;
}
#endif
#endif #endif

View File

@ -28,228 +28,230 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_ARITH_H #ifndef GRID_LATTICE_ARITH_H
#define GRID_LATTICE_ARITH_H #define GRID_LATTICE_ARITH_H
namespace Grid { NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add // avoid copy back routines for mult, mac, sub, add
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard; ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
#ifdef STREAMING_STORES decltype(coalescedRead(obj1())) tmp;
obj1 tmp; auto lhs_t = lhs_v(ss);
mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); auto rhs_t = rhs_v(ss);
vstream(ret._odata[ss],tmp); mult(&tmp,&lhs_t,&rhs_t);
#else coalescedWrite(ret_v[ss],tmp);
mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); });
#endif
}
} }
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard; ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
#ifdef STREAMING_STORES auto lhs_v = lhs.View();
obj1 tmp; auto rhs_v = rhs.View();
mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
vstream(ret._odata[ss],tmp); decltype(coalescedRead(obj1())) tmp;
#else auto lhs_t=lhs_v(ss);
mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); auto rhs_t=rhs_v(ss);
#endif mac(&tmp,&lhs_t,&rhs_t);
} coalescedWrite(ret_v[ss],tmp);
});
} }
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard; ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
#ifdef STREAMING_STORES auto lhs_v = lhs.View();
obj1 tmp; auto rhs_v = rhs.View();
sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
vstream(ret._odata[ss],tmp); decltype(coalescedRead(obj1())) tmp;
#else auto lhs_t=lhs_v(ss);
sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); auto rhs_t=rhs_v(ss);
#endif sub(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
} }
} template<class obj1,class obj2,class obj3> inline
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = lhs.checkerboard; ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
#ifdef STREAMING_STORES auto lhs_v = lhs.View();
obj1 tmp; auto rhs_v = rhs.View();
add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
vstream(ret._odata[ss],tmp); decltype(coalescedRead(obj1())) tmp;
#else auto lhs_t=lhs_v(ss);
add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]); auto rhs_t=rhs_v(ss);
#endif add(&tmp,&lhs_t,&rhs_t);
} coalescedWrite(ret_v[ss],tmp);
});
} }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add // avoid copy back routines for mult, mac, sub, add
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// mult with a Lattice left operand and a non-lattice right operand: per site, calls the
// pointer-form mult() into a temporary and stores the temporary into ret.
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard; ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
obj1 tmp; auto lhs_v = lhs.View();
mult(&tmp,&lhs._odata[ss],&rhs); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
vstream(ret._odata[ss],tmp); decltype(coalescedRead(obj1())) tmp;
// NOTE(review): the view-based mac/sub/add overloads with the same signature shape copy
// lhs_v(ss) into a local (lhs_t) before taking its address; here & is applied to the call
// result directly, which looks ill-formed wherever the view's operator() returns by value — confirm.
} mult(&tmp,&lhs_v(ss),&rhs);
coalescedWrite(ret_v[ss],tmp);
});
} }
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard; ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
obj1 tmp; auto lhs_v = lhs.View();
mac(&tmp,&lhs._odata[ss],&rhs); accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
vstream(ret._odata[ss],tmp); decltype(coalescedRead(obj1())) tmp;
} auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
});
} }
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard; ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
#ifdef STREAMING_STORES auto lhs_v = lhs.View();
obj1 tmp; accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
sub(&tmp,&lhs._odata[ss],&rhs); decltype(coalescedRead(obj1())) tmp;
vstream(ret._odata[ss],tmp); auto lhs_t=lhs_v(ss);
#else sub(&tmp,&lhs_t,&rhs);
sub(&ret._odata[ss],&lhs._odata[ss],&rhs); coalescedWrite(ret_v[ss],tmp);
#endif });
} }
} template<class obj1,class obj2,class obj3> inline
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.checkerboard = lhs.checkerboard; ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
#ifdef STREAMING_STORES auto lhs_v = lhs.View();
obj1 tmp; accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
add(&tmp,&lhs._odata[ss],&rhs); decltype(coalescedRead(obj1())) tmp;
vstream(ret._odata[ss],tmp); auto lhs_t=lhs_v(ss);
#else add(&tmp,&lhs_t,&rhs);
add(&ret._odata[ss],&lhs._odata[ss],&rhs); coalescedWrite(ret_v[ss],tmp);
#endif });
}
} }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add // avoid copy back routines for mult, mac, sub, add
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// mult with a non-lattice left operand and a Lattice right operand: per site, calls the
// pointer-form mult() into a temporary and stores the temporary into ret.
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard; ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ auto ret_v = ret.View();
// NOTE(review): rhs_v is obtained from lhs.View(), but lhs is the non-lattice operand
// (const obj2 &); rhs is the Lattice argument checked by conformable(ret,rhs).
// Likely intended rhs.View() — confirm.
#ifdef STREAMING_STORES auto rhs_v = lhs.View();
obj1 tmp; accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
mult(&tmp,&lhs,&rhs._odata[ss]); decltype(coalescedRead(obj1())) tmp;
vstream(ret._odata[ss],tmp); auto rhs_t=rhs_v(ss);
#else mult(&tmp,&lhs,&rhs_t);
mult(&ret._odata[ss],&lhs,&rhs._odata[ss]); coalescedWrite(ret_v[ss],tmp);
#endif });
}
} }
// mac with a non-lattice left operand and a Lattice right operand: per site, calls the
// pointer-form mac() into a temporary and stores the temporary into ret.
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard; ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ auto ret_v = ret.View();
// NOTE(review): rhs_v is obtained from lhs.View(), but lhs is the non-lattice operand
// (const obj2 &); rhs is the Lattice argument checked by conformable(ret,rhs).
// Likely intended rhs.View() — confirm.
#ifdef STREAMING_STORES auto rhs_v = lhs.View();
obj1 tmp; accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
mac(&tmp,&lhs,&rhs._odata[ss]); decltype(coalescedRead(obj1())) tmp;
vstream(ret._odata[ss],tmp); auto rhs_t=rhs_v(ss);
#else mac(&tmp,&lhs,&rhs_t);
mac(&ret._odata[ss],&lhs,&rhs._odata[ss]); coalescedWrite(ret_v[ss],tmp);
#endif });
}
} }
// sub with a non-lattice left operand and a Lattice right operand: per site, calls the
// pointer-form sub() into a temporary and stores the temporary into ret.
template<class obj1,class obj2,class obj3> strong_inline template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard; ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ auto ret_v = ret.View();
// NOTE(review): rhs_v is obtained from lhs.View(), but lhs is the non-lattice operand
// (const obj2 &); rhs is the Lattice argument checked by conformable(ret,rhs).
// Likely intended rhs.View() — confirm.
#ifdef STREAMING_STORES auto rhs_v = lhs.View();
obj1 tmp; accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
sub(&tmp,&lhs,&rhs._odata[ss]); decltype(coalescedRead(obj1())) tmp;
vstream(ret._odata[ss],tmp); auto rhs_t=rhs_v(ss);
#else sub(&tmp,&lhs,&rhs_t);
sub(&ret._odata[ss],&lhs,&rhs._odata[ss]); coalescedWrite(ret_v[ss],tmp);
#endif });
} }
// add with a non-lattice left operand and a Lattice right operand: per site, calls the
// pointer-form add() into a temporary and stores the temporary into ret.
} template<class obj1,class obj2,class obj3> inline
template<class obj1,class obj2,class obj3> strong_inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.checkerboard = rhs.checkerboard; ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ auto ret_v = ret.View();
// NOTE(review): rhs_v is obtained from lhs.View(), but lhs is the non-lattice operand
// (const obj2 &); rhs is the Lattice argument checked by conformable(ret,rhs).
// Likely intended rhs.View() — confirm.
#ifdef STREAMING_STORES auto rhs_v = lhs.View();
obj1 tmp; accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
add(&tmp,&lhs,&rhs._odata[ss]); decltype(coalescedRead(obj1())) tmp;
vstream(ret._odata[ss],tmp); auto rhs_t=rhs_v(ss);
#else add(&tmp,&lhs,&rhs_t);
add(&ret._odata[ss],&lhs,&rhs._odata[ss]); coalescedWrite(ret_v[ss],tmp);
#endif });
}
} }
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard; ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
parallel_for(int ss=0;ss<x._grid->oSites();ss++){ auto ret_v = ret.View();
#ifdef STREAMING_STORES auto x_v = x.View();
vobj tmp = a*x._odata[ss]+y._odata[ss]; auto y_v = y.View();
vstream(ret._odata[ss],tmp); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
#else auto tmp = a*x_v(ss)+y_v(ss);
ret._odata[ss]=a*x._odata[ss]+y._odata[ss]; coalescedWrite(ret_v[ss],tmp);
#endif });
} }
} template<class sobj,class vobj> inline
template<class sobj,class vobj> strong_inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard; ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
parallel_for(int ss=0;ss<x._grid->oSites();ss++){ auto ret_v = ret.View();
#ifdef STREAMING_STORES auto x_v = x.View();
vobj tmp = a*x._odata[ss]+b*y._odata[ss]; auto y_v = y.View();
vstream(ret._odata[ss],tmp); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
#else auto tmp = a*x_v(ss)+b*y_v(ss);
ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss]; coalescedWrite(ret_v[ss],tmp);
#endif });
}
} }
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){ RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
return axpy_norm_fast(ret,a,x,y); return axpy_norm_fast(ret,a,x,y);
} }
template<class sobj,class vobj> strong_inline template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
return axpby_norm_fast(ret,a,b,x,y); return axpby_norm_fast(ret,a,b,x,y);
} }
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -28,311 +28,428 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_BASE_H #pragma once
#define GRID_LATTICE_BASE_H
#define STREAMING_STORES #define STREAMING_STORES
namespace Grid { NAMESPACE_BEGIN(Grid);
// TODO:
// mac,real,imag
// Functionality:
// -=,+=,*=,()
// add,+,sub,-,mult,mac,*
// adj,conjugate
// real,imag
// transpose,transposeIndex
// trace,traceIndex
// peekIndex
// innerProduct,outerProduct,
// localNorm2
// localInnerProduct
extern int GridCshiftPermuteMap[4][16]; extern int GridCshiftPermuteMap[4][16];
//////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Basic expressions used in Expression Template // Base class which can be used by traits to pick up behaviour
//////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
class LatticeBase {};
class LatticeBase /////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// This prevents writing code that will not offload to GPU, but is perhaps annoyingly
// strict, since host code could in principle access the data directly through the lattice object.
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
// Base holding the state a lattice exposes to accelerator code: the grid
// pointer, the checkerboard tag, and the data pointer with its size (the value
// reported by oSites()).
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
  GridBase *_grid;
  int       checkerboard;
  vobj     *_odata;       // A managed pointer
  uint64_t  _odata_size;
public:
  // Initialisers are written in member declaration order, which is the order
  // C++ executes them in regardless of how the list is spelled; this keeps
  // the list truthful and silences -Wreorder.
  accelerator_inline LatticeAccelerator() : _grid(nullptr), checkerboard(0), _odata(nullptr), _odata_size(0) { }

  // Size recorded alongside _odata.
  accelerator_inline uint64_t oSites(void) const { return _odata_size; }

  // Read-only checkerboard tag.
  accelerator_inline int  Checkerboard(void) const { return checkerboard; }
  // Writable checkerboard tag.
  accelerator_inline int &Checkerboard(void) { return this->checkerboard; } // can assign checkerboard on a container, not a view

  // If `grid` is already set, check it against this object's grid via
  // conformable(); otherwise seed `grid` with this object's grid.
  accelerator_inline void Conformable(GridBase * &grid) const
  {
    if (grid) conformable(grid, _grid);
    else      grid = _grid;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{ {
public: public:
virtual ~LatticeBase(void) = default;
GridBase *_grid;
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
}; };
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to select code paths according to whether an object is a lattice or not,
// so introduce some type traits.
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {}; class LatticeExpressionBase {};
template <typename Op, typename T1> template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase { template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
public:
LatticeUnaryExpression(const std::pair<Op,std::tuple<T1> > &arg): std::pair<Op,std::tuple<T1> >(arg) {};
};
template <typename Op, typename T1, typename T2> template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
class LatticeBinaryExpression : public std::pair<Op,std::tuple<T1,T2> > , public LatticeExpressionBase { template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
public: template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
LatticeBinaryExpression(const std::pair<Op,std::tuple<T1,T2> > &arg): std::pair<Op,std::tuple<T1,T2> >(arg) {};
};
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename _T1>
class LatticeTrinaryExpression :public std::pair<Op,std::tuple<T1,T2,T3> >, public LatticeExpressionBase { class LatticeUnaryExpression : public LatticeExpressionBase
public:
LatticeTrinaryExpression(const std::pair<Op,std::tuple<T1,T2,T3> > &arg): std::pair<Op,std::tuple<T1,T2,T3> >(arg) {};
};
void inline conformable(GridBase *lhs,GridBase *rhs)
{ {
assert((lhs == rhs) && " conformable check pointers mismatch "); public:
} typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
/////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
class Lattice : public LatticeBase class Lattice : public LatticeAccelerator<vobj>
{ {
public: public:
int checkerboard; GridBase *Grid(void) const { return this->_grid; }
Vector<vobj> _odata; ///////////////////////////////////////////////////
// Member types
// to pthread need a computable loop where loop induction is not required ///////////////////////////////////////////////////
int begin(void) { return 0;};
int end(void) { return _odata.size(); }
vobj & operator[](int i) { return _odata[i]; };
const vobj & operator[](int i) const { return _odata[i]; };
public:
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef vobj vector_object; typedef vobj vector_object;
private:
// Release the site storage (if any is held) and return to the empty state:
// _odata == nullptr and _odata_size == 0.
void dealloc(void)
{
  alignedAllocator<vobj> alloc;
  if ( this->_odata_size == 0 ) return;   // nothing held, nothing to free
  alloc.deallocate(this->_odata,this->_odata_size);
  this->_odata      = nullptr;
  this->_odata_size = 0;
}
// Make the site storage hold exactly `size` elements.
//  - If the size is unchanged this is a no-op and the existing buffer is kept.
//    Previously a same-size call allocated a fresh buffer over the old pointer
//    without freeing it, leaking the old allocation.
//  - Otherwise the old buffer is freed and a new one is allocated; the old
//    contents are NOT carried over.
//  - size == 0 leaves the lattice empty (_odata == nullptr).
void resize(uint64_t size)
{
  if ( this->_odata_size != size ) {
    alignedAllocator<vobj> alloc;

    dealloc();                       // frees only if something is held

    this->_odata_size = size;
    if ( size )
      this->_odata = alloc.allocate(this->_odata_size);
    else
      this->_odata = nullptr;
  }
}
public:
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
// Build a LatticeView from this object's LatticeAccelerator base sub-object.
// The view is a copy of that base state (pointers and sizes), not of the site data.
LatticeView<vobj> View (void) const
{
  const LatticeAccelerator<vobj> &base = *this;
  return LatticeView<vobj>(base);
}
// Free the site storage when the lattice still holds any.
~Lattice() {
  const bool holds_storage = ( this->_odata_size != 0 );
  if ( holds_storage ) dealloc();
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Expression Template closure support // Expression Template closure support
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr) template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){ auto me = View();
#ifdef STREAMING_STORES accelerator_for(ss,me.size(),1,{
vobj tmp = eval(ss,expr); auto tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp); vstream(me[ss],tmp);
#else });
_odata[ss]=eval(ss,expr);
#endif
}
return *this; return *this;
} }
template <typename Op, typename T1,typename T2> strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
parallel_for(int ss=0;ss<_grid->oSites();ss++){ auto me = View();
#ifdef STREAMING_STORES accelerator_for(ss,me.size(),1,{
vobj tmp = eval(ss,expr); auto tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp); vstream(me[ss],tmp);
#else });
_odata[ss]=eval(ss,expr);
#endif
}
return *this; return *this;
} }
template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{ {
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
assert(egrid!=nullptr); assert(egrid!=nullptr);
conformable(_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
auto me = View();
parallel_for(int ss=0;ss<_grid->oSites();ss++){ accelerator_for(ss,me.size(),1,{
#ifdef STREAMING_STORES auto tmp = eval(ss,expr);
//vobj tmp = eval(ss,expr); vstream(me[ss],tmp);
vstream(_odata[ss] ,eval(ss,expr)); });
#else
_odata[ss] = eval(ss,expr);
#endif
}
return *this; return *this;
} }
//GridFromExpression is tricky to do //GridFromExpression is tricky to do
template<class Op,class T1> template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr) { Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
_grid = nullptr; this->_grid = nullptr;
GridFromExpression(_grid,expr); GridFromExpression(this->_grid,expr);
assert(_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
_odata.resize(_grid->oSites()); resize(this->_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES *this = expr;
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
} }
};
template<class Op,class T1, class T2> template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) { Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
_grid = nullptr; this->_grid = nullptr;
GridFromExpression(_grid,expr); GridFromExpression(this->_grid,expr);
assert(_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
_odata.resize(_grid->oSites()); resize(this->_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
#ifdef STREAMING_STORES *this = expr;
vobj tmp = eval(ss,expr);
vstream(_odata[ss] ,tmp);
#else
_odata[ss]=eval(ss,expr);
#endif
} }
};
template<class Op,class T1, class T2, class T3> template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) { Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
_grid = nullptr; this->_grid = nullptr;
GridFromExpression(_grid,expr); GridFromExpression(this->_grid,expr);
assert(_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
checkerboard=cb; this->checkerboard=cb;
_odata.resize(_grid->oSites()); resize(this->_grid->oSites());
parallel_for(int ss=0;ss<_grid->oSites();ss++){
vstream(_odata[ss] ,eval(ss,expr)); *this = expr;
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View();
thread_for(ss,me.size(),{
me[ss] = r;
});
return *this;
} }
};
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// Constructor requires "grid" passed. // Follow rule of five, with Constructor requires "grid" passed
// what about a default grid? // to user defined constructor
////////////////////////////////////////////////////////////////// ///////////////////////////////////////////
Lattice(GridBase *grid) : _odata(grid->oSites()) { // user defined constructor
_grid = grid; ///////////////////////////////////////////
// _odata.reserve(_grid->oSites()); Lattice(GridBase *grid) {
// _odata.resize(_grid->oSites()); this->_grid = grid;
// std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl; resize(this->_grid->oSites());
assert((((uint64_t)&_odata[0])&0xF) ==0); assert((((uint64_t)&this->_odata[0])&0xF) ==0);
checkerboard=0; this->checkerboard=0;
} }
Lattice(const Lattice& r){ // copy constructor // virtual ~Lattice(void) = default;
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
Lattice(Lattice&& r){ // move constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata=std::move(r._odata);
}
inline Lattice<vobj> & operator = (Lattice<vobj> && r)
{
_grid = r._grid;
checkerboard = r.checkerboard;
_odata =std::move(r._odata);
return *this;
}
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
virtual ~Lattice(void) = default;
void reset(GridBase* grid) { void reset(GridBase* grid) {
if (_grid != grid) { if (this->_grid != grid) {
_grid = grid; this->_grid = grid;
_odata.resize(grid->oSites()); this->_odata.resize(grid->oSites());
checkerboard = 0; this->checkerboard = 0;
} }
} }
///////////////////////////////////////////
// copy constructor
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ ///////////////////////////////////////////
parallel_for(int ss=0;ss<_grid->oSites();ss++){ Lattice(const Lattice& r){
this->_odata[ss]=r; // std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
this->_grid = r.Grid();
resize(this->_grid->oSites());
*this = r;
} }
///////////////////////////////////////////
// move constructor
///////////////////////////////////////////
// Move constructor: take over r's grid pointer, checkerboard and site storage,
// then leave r empty (null data pointer, zero size) so that r's destructor does
// not free the buffer now owned by *this.
// Declared noexcept (only pointer/integer copies happen here) so that standard
// containers move rather than deep-copy lattices when they reallocate.
Lattice(Lattice && r) noexcept {
  this->_grid       = r.Grid();
  this->_odata      = r._odata;
  this->_odata_size = r._odata_size;
  this->checkerboard= r.Checkerboard();
  r._odata      = nullptr;
  r._odata_size = 0;
}
///////////////////////////////////////////
// assignment template
///////////////////////////////////////////
template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View();
auto him= r.View();
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this; return *this;
} }
///////////////////////////////////////////
// Copy assignment
///////////////////////////////////////////
// Copy-assign the site data of r into this lattice (same element type).
// The checkerboard is taken from r BEFORE the lattice-level conformable() check.
// NOTE(review): that conformable(Lattice,Lattice) overload is not visible in this
// chunk; presumably it compares grids and checkerboards, so confirm before
// reordering these two statements.
// Storage is not resized here: the loop runs over me.size() sites of the
// destination view and writes each one from the matching site of r's view.
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
// Views are the accessor objects used inside the site loop.
auto me = View();
auto him= r.View();
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this;
}
///////////////////////////////////////////
// Move assignment possible if same type
///////////////////////////////////////////
// Move assignment: drop our own storage, then take over r's grid pointer,
// checkerboard and buffer, leaving r empty (null data pointer, zero size).
// Self-move is a no-op. Without the guard, resize(0) would free our buffer and
// the "stolen" pointer would then be the null we had just stored, silently
// emptying the lattice.
inline Lattice<vobj> & operator = (Lattice<vobj> && r){
  if ( this == &r ) return *this;

  resize(0); // deletes if appropriate
  this->_grid       = r.Grid();
  this->_odata      = r._odata;
  this->_odata_size = r._odata_size;
  this->checkerboard= r.Checkerboard();
  r._odata      = nullptr;
  r._odata_size = 0;
  return *this;
}
/////////////////////////////////////////////////////////////////////////////
// *=,+=,-= operators inherit behvour from correspond */+/- operation // *=,+=,-= operators inherit behvour from correspond */+/- operation
template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) { /////////////////////////////////////////////////////////////////////////////
template<class T> inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r; *this = (*this)*r;
return *this; return *this;
} }
template<class T> strong_inline Lattice<vobj> &operator -=(const T &r) { template<class T> inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r; *this = (*this)-r;
return *this; return *this;
} }
template<class T> strong_inline Lattice<vobj> &operator +=(const T &r) { template<class T> inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r; *this = (*this)+r;
return *this; return *this;
} }
// Exchange the LatticeAccelerator state (grid pointer, checkerboard, data pointer,
// size) of two lattices after the conformable() check. No site data is copied:
// only the base sub-objects are swapped through a temporary.
friend inline void swap(Lattice &l, Lattice &r) {
  conformable(l,r);
  LatticeAccelerator<vobj> &left_base  = l;
  LatticeAccelerator<vobj> &right_base = r;
  LatticeAccelerator<vobj> saved;
  saved      = left_base;
  left_base  = right_base;
  right_base = saved;
}
}; // class Lattice }; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
std::vector<int> gcoor;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
sobj ss; sobj ss;
for(int g=0;g<o._grid->_gsites;g++){
o._grid->GlobalIndexToGlobalCoor(g,gcoor);
peekSite(ss,o,gcoor); peekSite(ss,o,gcoor);
stream<<"["; stream<<"[";
for(int d=0;d<gcoor.size();d++){ for(int d=0;d<gcoor.size();d++){
@ -345,31 +462,5 @@ public:
return stream; return stream;
} }
} NAMESPACE_END(Grid);
#include "Lattice_conformable.h"
#define GRID_LATTICE_EXPRESSION_TEMPLATES
#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES
#include "Lattice_ET.h"
#else
#include "Lattice_overload.h"
#endif
#include "Lattice_arith.h"
#include "Lattice_trace.h"
#include "Lattice_transpose.h"
#include "Lattice_local.h"
#include "Lattice_reduction.h"
#include "Lattice_peekpoke.h"
#include "Lattice_reality.h"
#include "Lattice_comparison_utils.h"
#include "Lattice_comparison.h"
#include "Lattice_coordinate.h"
#include "Lattice_where.h"
#include "Lattice_rng.h"
#include "Lattice_unary.h"
#include "Lattice_transfer.h"
#endif

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_COMPARISON_H #ifndef GRID_LATTICE_COMPARISON_H
#define GRID_LATTICE_COMPARISON_H #define GRID_LATTICE_COMPARISON_H
namespace Grid { NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// relational operators // relational operators
@ -40,40 +40,78 @@ namespace Grid {
//Query supporting logical &&, ||, //Query supporting logical &&, ||,
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Element type of the lattices returned by the comparison helpers in this file.
using vPredicate = iScalar<vInteger>;
/*
template <class iobj, class vobj, class robj> accelerator_inline
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
{
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
*/
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// compare lattice to lattice // compare lattice to lattice
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj> template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vInteger> ret(rhs._grid); Lattice<vPredicate> ret(rhs.Grid());
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ auto lhs_v = lhs.View();
ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); auto rhs_v = rhs.View();
} auto ret_v = ret.View();
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
});
return ret; return ret;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// compare lattice to scalar // compare lattice to scalar
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj> template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{ {
Lattice<vInteger> ret(lhs._grid); Lattice<vPredicate> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ auto lhs_v = lhs.View();
ret._odata[ss]=op(lhs._odata[ss],rhs); auto ret_v = ret.View();
} thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs);
});
return ret; return ret;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// compare scalar to lattice // compare scalar to lattice
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj> template<class vfunctor,class lobj,class robj>
inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vInteger> ret(rhs._grid); Lattice<vPredicate> ret(rhs.Grid());
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ auto rhs_v = rhs.View();
ret._odata[ss]=op(lhs._odata[ss],rhs); auto ret_v = ret.View();
} thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]);
});
return ret; return ret;
} }
@ -82,88 +120,88 @@ namespace Grid {
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Less than // Less than
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs); return LLComparison(vlt<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator < (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs); return LSComparison(vlt<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator < (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs); return SLComparison(vlt<lobj,robj>(),lhs,rhs);
} }
// Less than equal // Less than equal
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs); return LLComparison(vle<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs); return LSComparison(vle<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator <= (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs); return SLComparison(vle<lobj,robj>(),lhs,rhs);
} }
// Greater than // Greater than
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs); return LLComparison(vgt<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator > (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs); return LSComparison(vgt<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator > (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs); return SLComparison(vgt<lobj,robj>(),lhs,rhs);
} }
// Greater than equal // Greater than equal
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs); return LLComparison(vge<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs); return LSComparison(vge<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator >= (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs); return SLComparison(vge<lobj,robj>(),lhs,rhs);
} }
// equal // equal
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs); return LLComparison(veq<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator == (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs); return LSComparison(veq<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator == (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs); return SLComparison(veq<lobj,robj>(),lhs,rhs);
} }
// not equal // not equal
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs); return LLComparison(vne<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator != (const Lattice<lobj> & lhs, const robj & rhs) { inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs); return LSComparison(vne<lobj,robj>(),lhs,rhs);
} }
template<class lobj,class robj> template<class lobj,class robj>
inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) { inline Lattice<vPredicate> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs); return SLComparison(vne<lobj,robj>(),lhs,rhs);
} }
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -26,10 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid { #pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////// /////////////////////////////////////////
// This implementation is a bit poor. // This implementation is a bit poor.
@ -44,42 +44,42 @@ namespace Grid {
// //
template<class lobj,class robj> class veq { template<class lobj,class robj> class veq {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) == (rhs); return (lhs) == (rhs);
} }
}; };
template<class lobj,class robj> class vne { template<class lobj,class robj> class vne {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) != (rhs); return (lhs) != (rhs);
} }
}; };
template<class lobj,class robj> class vlt { template<class lobj,class robj> class vlt {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) < (rhs); return (lhs) < (rhs);
} }
}; };
template<class lobj,class robj> class vle { template<class lobj,class robj> class vle {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) <= (rhs); return (lhs) <= (rhs);
} }
}; };
template<class lobj,class robj> class vgt { template<class lobj,class robj> class vgt {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) > (rhs); return (lhs) > (rhs);
} }
}; };
template<class lobj,class robj> class vge { template<class lobj,class robj> class vge {
public: public:
vInteger operator()(const lobj &lhs, const robj &rhs) accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) >= (rhs); return (lhs) >= (rhs);
} }
@ -88,42 +88,42 @@ namespace Grid {
// Generic list of functors // Generic list of functors
template<class lobj,class robj> class seq { template<class lobj,class robj> class seq {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) == (rhs); return (lhs) == (rhs);
} }
}; };
template<class lobj,class robj> class sne { template<class lobj,class robj> class sne {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) != (rhs); return (lhs) != (rhs);
} }
}; };
template<class lobj,class robj> class slt { template<class lobj,class robj> class slt {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) < (rhs); return (lhs) < (rhs);
} }
}; };
template<class lobj,class robj> class sle { template<class lobj,class robj> class sle {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) <= (rhs); return (lhs) <= (rhs);
} }
}; };
template<class lobj,class robj> class sgt { template<class lobj,class robj> class sgt {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) > (rhs); return (lhs) > (rhs);
} }
}; };
template<class lobj,class robj> class sge { template<class lobj,class robj> class sge {
public: public:
Integer operator()(const lobj &lhs, const robj &rhs) accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{ {
return (lhs) >= (rhs); return (lhs) >= (rhs);
} }
@ -133,12 +133,12 @@ namespace Grid {
// Integer and real get extra relational functions. // Integer and real get extra relational functions.
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs) accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<scalar> vrhs(vsimd::Nsimd()); ExtractBuffer<scalar> vrhs(vsimd::Nsimd());
std::vector<Integer> vpred(vsimd::Nsimd()); ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(lhs,vlhs); extract<vsimd,scalar>(lhs,vlhs);
extract<vsimd,scalar>(rhs,vrhs); extract<vsimd,scalar>(rhs,vrhs);
@ -150,11 +150,11 @@ namespace Grid {
} }
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs) accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd()); ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(lhs,vlhs); extract<vsimd,scalar>(lhs,vlhs);
for(int s=0;s<vsimd::Nsimd();s++){ for(int s=0;s<vsimd::Nsimd();s++){
@ -165,11 +165,11 @@ namespace Grid {
} }
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0> template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs) accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
{ {
typedef typename vsimd::scalar_type scalar; typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd()); ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret; vInteger ret;
extract<vsimd,scalar>(rhs,vrhs); extract<vsimd,scalar>(rhs,vrhs);
for(int s=0;s<vsimd::Nsimd();s++){ for(int s=0;s<vsimd::Nsimd();s++){
@ -181,30 +181,30 @@ namespace Grid {
#define DECLARE_RELATIONAL_EQ(op,functor) \ #define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\ accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \ accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd,IfSimd<vsimd> = 0>\ template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \ accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
{\ {\
typedef typename vsimd::scalar_type scalar;\ typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\ return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\ }\
template<class vsimd>\ template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \ accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \ { \
return lhs._internal op rhs; \ return lhs._internal op rhs; \
} \ } \
template<class vsimd>\ template<class vsimd>\
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \ accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \ { \
return lhs op rhs._internal; \ return lhs op rhs._internal; \
} \ } \
@ -212,7 +212,7 @@ namespace Grid {
#define DECLARE_RELATIONAL(op,functor) \ #define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \ DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\ template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\ accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \ { \
return lhs._internal op rhs._internal; \ return lhs._internal op rhs._internal; \
} }
@ -226,7 +226,7 @@ DECLARE_RELATIONAL(!=,sne);
#undef DECLARE_RELATIONAL #undef DECLARE_RELATIONAL
} NAMESPACE_END(Grid);
#endif

View File

@ -28,13 +28,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_CONFORMABLE_H #ifndef GRID_LATTICE_CONFORMABLE_H
#define GRID_LATTICE_CONFORMABLE_H #define GRID_LATTICE_CONFORMABLE_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs) template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{ {
assert(lhs._grid == rhs._grid); assert(lhs.Grid() == rhs.Grid());
assert(lhs.checkerboard == rhs.checkerboard); assert(lhs.Checkerboard() == rhs.Checkerboard());
} }
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -25,32 +25,50 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_COORDINATE_H #pragma once
#define GRID_LATTICE_COORDINATE_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu) template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
{ {
typedef typename iobj::scalar_type scalar_type; typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type; typedef typename iobj::vector_type vector_type;
GridBase *grid = l._grid; GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
std::vector<int> gcoor; Coordinate gcoor;
std::vector<scalar_type> mergebuf(Nsimd); ExtractBuffer<scalar_type> mergebuf(Nsimd);
vector_type vI; vector_type vI;
auto l_v = l.View();
for(int o=0;o<grid->oSites();o++){ for(int o=0;o<grid->oSites();o++){
for(int i=0;i<grid->iSites();i++){ for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor); grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu]; mergebuf[i]=(Integer)gcoor[mu];
} }
merge<vector_type,scalar_type>(vI,mergebuf); merge<vector_type,scalar_type>(vI,mergebuf);
l._odata[o]=vI; l_v[o]=vI;
} }
}; };
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obsolete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
} }
#endif }}
}
NAMESPACE_END(Grid);

View File

@ -32,7 +32,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// localInner, localNorm, outerProduct // localInner, localNorm, outerProduct
/////////////////////////////////////////////// ///////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// Non site, reduced locally reduced routines // Non site, reduced locally reduced routines
@ -42,10 +42,12 @@ namespace Grid {
template<class vobj> template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs._grid); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ auto rhs_v = rhs.View();
ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]); auto ret_v = ret.View();
} accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
});
return ret; return ret;
} }
@ -53,23 +55,33 @@ namespace Grid {
template<class vobj> template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs._grid); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ auto lhs_v = lhs.View();
ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]); auto rhs_v = rhs.View();
} auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
});
return ret; return ret;
} }
// outerProduct Scalar x Scalar -> Scalar // outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix // Vector x Vector -> Matrix
template<class ll,class rr> template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(ll(),rr()))>
{ {
Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); typedef decltype(coalescedRead(ll())) sll;
parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ typedef decltype(coalescedRead(rr())) srr;
ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
} auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop
ret_v[ss]=outerProduct(lhs_v[ss],rhs_v[ss]);
});
return ret; return ret;
} }
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -0,0 +1,202 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////
// sliceMaddMatrix
//
// For every line of sites running along direction Orthog, computes
//
//    R[i] = Y[i] + sum_j X[j] * ( scale * aa(j,i) )      i,j = 0 .. Nblock-1
//
// where Nblock is the global lattice extent in direction Orthog.
//
//  R      : destination lattice (written through its view)
//  aa     : coefficient matrix, read at (j,i) for 0 <= i,j < Nblock
//  X, Y   : source lattices
//  Orthog : direction along which sites are mixed; its SIMD layout must be 1
//  scale  : factor multiplied into every matrix coefficient
//
// The X values on a line are copied into a scratch buffer before any R site
// on that line is written.
//////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
  using sobj        = typename vobj::scalar_object;
  using scalar_type = typename vobj::scalar_type;
  using vector_type = typename vobj::vector_type;

  int Nblock = X.Grid()->GlobalDimensions()[Orthog];

  GridBase *FullGrid = X.Grid();

  assert( FullGrid->_simd_layout[Orthog]==1);

  // FIXME package in a convenient iterator
  // Should loop over a plane orthogonal to direction "Orthog"
  int stride  = FullGrid->_slice_stride[Orthog];
  int block   = FullGrid->_slice_block [Orthog];
  int nblock  = FullGrid->_slice_nblock[Orthog];
  int ostride = FullGrid->_ostride[Orthog];

  auto X_v = X.View();
  auto Y_v = Y.View();
  auto R_v = R.View();
  thread_region
  {
    // Scratch copy of the X values along the current line (one buffer per thread_region body).
    std::vector<vobj> xLine(Nblock);

    thread_loop_collapse2( (int n=0;n<nblock;n++),{
      for(int b=0;b<block;b++){
        const int base = n*stride + b;   // first site of this line

        for(int k=0;k<Nblock;k++){
          xLine[k] = X_v[base+k*ostride];
        }

        vobj acc;
        for(int col=0;col<Nblock;col++){
          const int site = base+col*ostride;
          acc = Y_v[site];
          for(int row=0;row<Nblock;row++){
            acc = acc + xLine[row]*(scale*aa(row,col));
          }
          R_v[site]=acc;
        }
      }});
  }
}
//////////////////////////////////////////////////////////////////////////////////////////
// sliceMulMatrix
//
// For every line of sites running along direction Orthog, computes
//
//    R[i] = sum_j X[j] * ( scale * aa(j,i) )      i,j = 0 .. Nblock-1
//
// where Nblock is the global lattice extent in direction Orthog.
//
//  R      : destination lattice (written through its view)
//  aa     : coefficient matrix, read at (j,i) for 0 <= i,j < Nblock
//  X      : source lattice
//  Orthog : direction along which sites are mixed; its SIMD layout must be 1
//  scale  : factor multiplied into every matrix coefficient
//
// The X values on a line are copied into a scratch buffer before any R site
// on that line is written.
//////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
  using sobj        = typename vobj::scalar_object;
  using scalar_type = typename vobj::scalar_type;
  using vector_type = typename vobj::vector_type;

  int Nblock = X.Grid()->GlobalDimensions()[Orthog];

  GridBase *FullGrid = X.Grid();

  assert( FullGrid->_simd_layout[Orthog]==1);

  // FIXME package in a convenient iterator
  // Should loop over a plane orthogonal to direction "Orthog"
  int stride  = FullGrid->_slice_stride[Orthog];
  int block   = FullGrid->_slice_block [Orthog];
  int nblock  = FullGrid->_slice_nblock[Orthog];
  int ostride = FullGrid->_ostride[Orthog];

  auto X_v = X.View();
  auto R_v = R.View();
  thread_region
  {
    // Scratch copy of the X values along the current line.
    std::vector<vobj> xLine(Nblock);

    thread_loop_collapse2( (int n=0;n<nblock;n++),{
      for(int b=0;b<block;b++){
        const int base = n*stride + b;   // first site of this line

        for(int k=0;k<Nblock;k++){
          xLine[k] = X_v[base+k*ostride];
        }

        vobj acc;
        for(int col=0;col<Nblock;col++){
          // Seed with the row-0 term so the accumulator never needs zeroing.
          acc = xLine[0]*(scale*aa(0,col));
          for(int row=1;row<Nblock;row++){
            acc = acc + xLine[row]*(scale*aa(row,col));
          }
          R_v[base+col*ostride]=acc;
        }
      }});
  }
}
//////////////////////////////////////////////////////////////////////////////////////////
// sliceInnerProductMatrix
//
// Fills mat (resized to Nblock x Nblock, Nblock = global extent in direction Orthog) with
//
//    mat(i,j) = sum over lines of  Reduce( innerProduct( lhs[i] , rhs[j] ) )
//
// where i,j label positions along direction Orthog. Each thread_region body
// accumulates into a private matrix which is added into mat inside
// thread_critical; every entry is then passed through GlobalSum on the grid.
//
//  mat    : output, overwritten
//  lhs    : left operand of innerProduct
//  rhs    : right operand of innerProduct
//  Orthog : direction labelling rows/columns; its SIMD layout must be 1
//////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
  using sobj        = typename vobj::scalar_object;
  using scalar_type = typename vobj::scalar_type;
  using vector_type = typename vobj::vector_type;

  GridBase *FullGrid = lhs.Grid();

  int Nblock = FullGrid->GlobalDimensions()[Orthog];

  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);

  assert( FullGrid->_simd_layout[Orthog]==1);

  // FIXME package in a convenient iterator
  // Should loop over a plane orthogonal to direction "Orthog"
  int stride  = FullGrid->_slice_stride[Orthog];
  int block   = FullGrid->_slice_block [Orthog];
  int nblock  = FullGrid->_slice_nblock[Orthog];
  int ostride = FullGrid->_ostride[Orthog];

  using vector_typeD = typename vobj::vector_typeD;

  auto lhs_v = lhs.View();
  auto rhs_v = rhs.View();
  thread_region {
    // Per-body scratch: the two lines of sites and a private accumulator.
    std::vector<vobj> leftLine(Nblock);
    std::vector<vobj> rightLine(Nblock);
    Eigen::MatrixXcd partial = Eigen::MatrixXcd::Zero(Nblock,Nblock);

    thread_loop_collapse2((int n=0;n<nblock;n++),{
      for(int b=0;b<block;b++){
        const int base = n*stride + b;   // first site of this line

        for(int k=0;k<Nblock;k++){
          const int site = base+k*ostride;
          leftLine [k] = lhs_v[site];
          rightLine[k] = rhs_v[site];
        }

        for(int row=0;row<Nblock;row++){
          for(int col=0;col<Nblock;col++){
            auto ip      = innerProduct(leftLine[row],rightLine[col]);
            auto stripped = TensorRemove(ip);
            ComplexD z   = Reduce(stripped);
            partial(row,col) += std::complex<double>(real(z),imag(z));
          }
        }
      }});

    thread_critical {
      mat += partial;
    }
  }

  // Combine the per-rank results entry by entry.
  for(int row=0;row<Nblock;row++){
    for(int col=0;col<Nblock;col++){
      ComplexD entry = mat(row,col);
      FullGrid->GlobalSum(entry);
      mat(row,col)=entry;
    }
  }

  return;
}
NAMESPACE_END(Grid);

View File

@ -1,138 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_overload.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_OVERLOAD_H
#define GRID_LATTICE_OVERLOAD_H
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Returns a new lattice, built on r's grid, whose outer site ss holds -r._odata[ss].
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
  Lattice<vobj> negated(r._grid);
  const auto osites = r._grid->oSites();
  parallel_for(int site=0;site<osites;site++){
    vstream(negated._odata[site], -r._odata[site]);
  }
  return negated;
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
//NB mult performs conformable check. Do not reapply here for performance.
/////////////////////////////////////////////////////////////////////////////////////
// Lattice * Lattice: allocates the result on rhs's grid and delegates to mult().
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
  using product_t = decltype(lhs._odata[0]*rhs._odata[0]);
  Lattice<product_t> product(rhs._grid);
  mult(product,lhs,rhs);
  return product;
}
// Lattice + Lattice: allocates the result on rhs's grid and delegates to add().
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]+rhs._odata[0])>
{
  using sum_t = decltype(lhs._odata[0]+rhs._odata[0]);
  Lattice<sum_t> total(rhs._grid);
  add(total,lhs,rhs);
  return total;
}
// Lattice - Lattice: allocates the result on rhs's grid and delegates to sub().
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]-rhs._odata[0])>
{
  using diff_t = decltype(lhs._odata[0]-rhs._odata[0]);
  Lattice<diff_t> difference(rhs._grid);
  sub(difference,lhs,rhs);
  return difference;
}
// Scalar BinOp Lattice ;generate return type
// scalar * Lattice: result lives on rhs's grid; outer site ss holds lhs*rhs._odata[ss].
template<class left,class right>
inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])>
{
  using product_t = decltype(lhs*rhs._odata[0]);
  Lattice<product_t> product(rhs._grid);
  const auto osites = rhs._grid->oSites();
  parallel_for(int site=0;site<osites;site++){
    // Form the site value in a local first, then stream it to the result.
    product_t value = lhs*rhs._odata[site];
    vstream(product._odata[site],value);
  }
  return product;
}
// scalar + Lattice: result lives on rhs's grid; outer site ss holds lhs + rhs._odata[ss].
//
// Fix: the site value was previously computed as lhs - rhs._odata[ss], so this
// addition operator returned a difference.
template<class left,class right>
inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])>
{
  Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid);
  parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){
    // Form the site value in a local first, then stream it to the result.
    decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];
    vstream(ret._odata[ss],tmp);
  }
  return ret;
}
// scalar - Lattice: result lives on rhs's grid; outer site ss holds lhs - rhs._odata[ss].
template<class left,class right>
inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])>
{
  using diff_t = decltype(lhs-rhs._odata[0]);
  Lattice<diff_t> difference(rhs._grid);
  const auto osites = rhs._grid->oSites();
  parallel_for(int site=0;site<osites;site++){
    // Form the site value in a local first, then stream it to the result.
    diff_t value = lhs-rhs._odata[site];
    vstream(difference._odata[site],value);
  }
  return difference;
}
// Lattice * scalar: result lives on lhs's grid; outer site ss holds lhs._odata[ss]*rhs.
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)>
{
  using product_t = decltype(lhs._odata[0]*rhs);
  Lattice<product_t> product(lhs._grid);
  const auto osites = lhs._grid->oSites();
  parallel_for(int site=0;site<osites;site++){
    // Form the site value in a local first, then stream it to the result.
    product_t value = lhs._odata[site]*rhs;
    vstream(product._odata[site],value);
  }
  return product;
}
// Lattice + scalar: result lives on lhs's grid; outer site ss holds lhs._odata[ss] + rhs.
//
// Fix: the loop bound previously read rhs._grid, but rhs is the scalar operand
// here; the lattice (and therefore the grid) is lhs.
template<class left,class right>
inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)>
{
  Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid);
  parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
    // Form the site value in a local first, then stream it to the result.
    decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;
    vstream(ret._odata[ss],tmp);
  }
  return ret;
}
// Lattice - scalar: result lives on lhs's grid; outer site ss holds lhs._odata[ss] - rhs.
//
// Fix: the loop bound previously read rhs._grid, but rhs is the scalar operand
// here; the lattice (and therefore the grid) is lhs.
template<class left,class right>
inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)>
{
  Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid);
  parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){
    // Form the site value in a local first, then stream it to the result.
    decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs;
    vstream(ret._odata[ss],tmp);
  }
  return ret;
}
}
#endif

View File

@ -34,29 +34,35 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
// Peeking and poking around // Peeking and poking around
/////////////////////////////////////////////// ///////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
// FIXME accelerator_loop and accelerator_inline these
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Peek internal indices of a Lattice object // Peek internal indices of a Lattice object
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(vobj(),i))>
{ {
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid); Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.checkerboard=lhs.checkerboard; ret.Checkerboard()=lhs.Checkerboard();
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i); auto lhs_v = lhs.View();
} thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
});
return ret; return ret;
}; };
template<int Index,class vobj> template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(vobj(),i,j))>
{ {
Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid); Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.checkerboard=lhs.checkerboard; ret.Checkerboard()=lhs.Checkerboard();
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j); auto lhs_v = lhs.View();
} thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
});
return ret; return ret;
}; };
@ -64,34 +70,38 @@ namespace Grid {
// Poke internal indices of a Lattice object // Poke internal indices of a Lattice object
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{ {
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto rhs_v = rhs.View();
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i); auto lhs_v = lhs.View();
} thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
});
} }
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{ {
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto rhs_v = rhs.View();
pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j); auto lhs_v = lhs.View();
} thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
});
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Poke a scalar object into the SIMD array // Poke a scalar object into the SIMD array
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj,class sobj> template<class vobj,class sobj>
void pokeSite(const sobj &s,Lattice<vobj> &l,const std::vector<int> &site){ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l._grid; GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx; int rank,odx,idx;
@ -99,13 +109,13 @@ namespace Grid {
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
grid->Broadcast(grid->BossRank(),s); grid->Broadcast(grid->BossRank(),s);
std::vector<sobj> buf(Nsimd);
// extract-modify-merge cycle is easiest way and this is not perf critical // extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
if ( rank == grid->ThisRank() ) { if ( rank == grid->ThisRank() ) {
extract(l._odata[odx],buf); extract(l_v[odx],buf);
buf[idx] = s; buf[idx] = s;
merge(l._odata[odx],buf); merge(l_v[odx],buf);
} }
return; return;
@ -116,22 +126,23 @@ namespace Grid {
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
template<class vobj,class sobj> template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const std::vector<int> &site){ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l._grid; GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.checkerboard == l._grid->CheckerBoard(site)); assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
int rank,odx,idx; int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
std::vector<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
extract(l._odata[odx],buf); auto l_v = l.View();
extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@ -145,16 +156,16 @@ namespace Grid {
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
template<class vobj,class sobj> template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){ void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
GridBase *grid = l._grid; GridBase *grid = l.Grid();
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@ -162,7 +173,8 @@ namespace Grid {
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l._odata[odx]; auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
@ -173,16 +185,16 @@ namespace Grid {
}; };
template<class vobj,class sobj> template<class vobj,class sobj>
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){ void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
GridBase *grid=l._grid; GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@ -190,9 +202,9 @@ namespace Grid {
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l._odata[odx]; auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w]; vp[idx+w*Nsimd] = pt[w];
} }
@ -200,6 +212,6 @@ namespace Grid {
return; return;
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -36,22 +36,28 @@ Author: neo <cossu@post.kek.jp>
// The choice of burying complex in the SIMD // The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome // is making the use of "real" and "imag" very cumbersome
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid); Lattice<vobj> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto lhs_v = lhs.View();
ret._odata[ss] = adj(lhs._odata[ss]); auto ret_v = ret.View();
} accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
});
return ret; return ret;
}; };
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid); Lattice<vobj> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto lhs_v = lhs.View();
ret._odata[ss] = conjugate(lhs._odata[ss]); auto ret_v = ret.View();
} accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
});
return ret; return ret;
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -19,22 +19,76 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_REDUCTION_H #pragma once
#define GRID_LATTICE_REDUCTION_H
#include <Grid/Grid_Eigen_Dense.h> #include <Grid/Grid_Eigen_Dense.h>
namespace Grid {
#ifdef GRID_WARN_SUBOPTIMAL #ifdef GRID_NVCC
#warning "Optimisation alert all these reduction loops are NOT threaded " #include <Grid/lattice/Lattice_reduction_gpu.h>
#endif #endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
// FIXME this should promote to double and accumulate
//////////////////////////////////////////////////////
// Host-side reduction of an array of SIMD objects to a single scalar object.
//
// arg    : pointer to the first of `osites` vector objects.
// osites : number of vector objects to add up.
// Returns the sum over all sites and over all SIMD lanes (lanes are collapsed
// with Reduce()). No inter-node communication happens here.
//
// The accumulation is carried out in the native precision of vobj; see the
// FIXME above about promoting to double.
template<class vobj>
inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{
  typedef typename vobj::scalar_object sobj;

  const int nthread = GridThread::GetThreads();

  // One partial result per thread, cleared before the parallel section.
  Vector<sobj> sumarray(nthread);
  for(int i=0;i<nthread;i++){
    sumarray[i]=Zero();
  }

  thread_for(thr,nthread, {
    // GetWork hands this thread a contiguous range [myoff, myoff+mywork) of sites.
    int nwork, mywork, myoff;
    nwork = osites;
    GridThread::GetWork(nwork,thr,mywork,myoff);

    // Accumulate in SIMD form, collapse the lanes once per thread.
    vobj vvsum=Zero();
    for(int ss=myoff;ss<mywork+myoff; ss++){
      vvsum = vvsum + arg[ss];
    }
    sumarray[thr]=Reduce(vvsum);
  });

  // Serial combination of the per-thread partial sums.
  sobj ssum=Zero();
  for(int i=0;i<nthread;i++){
    ssum = ssum+sumarray[i];
  }

  return ssum;
}
// Node-local sum of `osites` vector objects starting at `arg`.
// Build-time dispatch: when GRID_NVCC is defined the work goes to sum_gpu
// (declared outside this part of the file), otherwise to sum_cpu.
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
  typedef typename vobj::scalar_object sobj;
#ifdef GRID_NVCC
  sobj total = sum_gpu(arg,osites);
#else
  sobj total = sum_cpu(arg,osites);
#endif
  return total;
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto arg_v = arg.View();
Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations // Deterministic Reduction operations
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
auto nrm = innerProduct(arg,arg); ComplexD nrm = innerProduct(arg,arg);
return std::real(nrm); return real(nrm);
} }
// Double inner product // Double inner product
@ -43,32 +97,49 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
GridBase *grid = left._grid; ComplexD nrm;
const int pad = 8;
ComplexD inner; GridBase *grid = left.Grid();
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ // Might make all code paths go this way.
int nwork, mywork, myoff; auto left_v = left.View();
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); auto right_v=right.View();
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation const uint64_t nsimd = grid->Nsimd();
for(int ss=myoff;ss<mywork+myoff; ss++){ const uint64_t sites = grid->oSites();
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
}
// All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}
inner=0.0; #ifdef GRID_NVCC
for(int i=0;i<grid->SumArraySize();i++){ // GPU - SIMT lane compliance...
inner = inner+sumarray[i*pad]; typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
} Vector<inner_t> inner_tmp(sites);
right._grid->GlobalSum(inner); auto inner_tmp_v = &inner_tmp[0];
return inner;
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
})
// This is in single precision and fails some tests
// Need a sumD that sums in double
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
grid->GlobalSum(nrm);
return nrm;
} }
///////////////////////// /////////////////////////
@ -86,8 +157,7 @@ axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj
template<class sobj,class vobj> strong_inline RealD template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
const int pad = 8; z.Checkerboard() = x.Checkerboard();
z.checkerboard = x.checkerboard;
conformable(z,x); conformable(z,x);
conformable(x,y); conformable(x,y);
@ -95,43 +165,57 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
RealD nrm; RealD nrm;
GridBase *grid = x._grid; GridBase *grid = x.Grid();
Vector<RealD> sumarray(grid->SumArraySize()*pad); auto x_v=x.View();
auto y_v=y.View();
auto z_v=z.View();
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ const uint64_t nsimd = grid->Nsimd();
int nwork, mywork, myoff; const uint64_t sites = grid->oSites();
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
// private to thread; sub summation #ifdef GRID_NVCC
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero; // GPU
for(int ss=myoff;ss<mywork+myoff; ss++){ typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
vobj tmp = a*x._odata[ss]+b*y._odata[ss]; Vector<inner_t> inner_tmp(sites);
vnrm = vnrm + innerProductD(tmp,tmp); auto inner_tmp_v = &inner_tmp[0];
vstream(z._odata[ss],tmp);
}
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}
nrm = 0.0; // sum across threads; linear in thread count but fast accelerator_for( ss, sites, nsimd,{
for(int i=0;i<grid->SumArraySize();i++){ auto tmp = a*x_v(ss)+b*y_v(ss);
nrm = nrm+sumarray[i*pad]; coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
} coalescedWrite(z_v[ss],tmp);
z._grid->GlobalSum(nrm); });
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
});
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm);
return nrm; return nrm;
} }
template<class Op,class T1> template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
template<class Op,class T1,class T2> template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr) inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object ->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
@ -139,54 +223,14 @@ inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
template<class Op,class T1,class T2,class T3> template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)), ->typename decltype(expr.op.func(eval(0,expr.arg1),
eval(0,std::get<1>(expr.second)), eval(0,expr.arg2),
eval(0,std::get<2>(expr.second)) eval(0,expr.arg3)
))::scalar_object ))::scalar_object
{ {
return sum(closure(expr)); return sum(closure(expr));
} }
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -199,7 +243,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// But easily avoided by using double precision fields // But easily avoided by using double precision fields
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid; GridBase *grid = Data.Grid();
assert(grid!=NULL); assert(grid!=NULL);
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
@ -212,13 +256,13 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first Vector<vobj> lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
lvSum[r]=zero; lvSum[r]=Zero();
} }
int e1= grid->_slice_nblock[orthogdim]; int e1= grid->_slice_nblock[orthogdim];
@ -227,20 +271,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
parallel_for(int r=0;r<rd;r++){ auto Data_v=Data.View();
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data._odata[ss]; lvSum[r]=lvSum[r]+Data_v[ss];
}
} }
} }
});
// Sum across simd lanes in the plane, breaking out orthog dir. // Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd); Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){ for(int rt=0;rt<rd;rt++){
@ -265,7 +308,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
if ( pt == grid->_processor_coor[orthogdim] ) { if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt]; gsum=lsSum[lt];
} else { } else {
gsum=zero; gsum=Zero();
} }
grid->GlobalSum(gsum); grid->GlobalSum(gsum);
@ -292,9 +335,9 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
// std::cout << GridLogMessage << "Start prep" << std::endl; // std::cout << GridLogMessage << "Start prep" << std::endl;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid; GridBase *grid = lhs.Grid();
assert(grid!=NULL); assert(grid!=NULL);
conformable(grid,rhs._grid); conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
@ -307,14 +350,14 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
// std::cout << GridLogMessage << "Start alloc" << std::endl; // std::cout << GridLogMessage << "Start alloc" << std::endl;
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first Vector<vector_type> lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type>> extracted(Nsimd); // splitting the SIMD ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
// std::cout << GridLogMessage << "End alloc" << std::endl; // std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
lvSum[r]=zero; lvSum[r]=Zero();
} }
int e1= grid->_slice_nblock[orthogdim]; int e1= grid->_slice_nblock[orthogdim];
@ -323,23 +366,24 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
// std::cout << GridLogMessage << "End prep" << std::endl; // std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl; // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv; vector_type vv;
parallel_for(int r=0;r<rd;r++) auto l_v=lhs.View();
{ auto r_v=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss = so + n * stride + b; int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss])); vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
lvSum[r] = lvSum[r] + vv; lvSum[r] = lvSum[r] + vv;
} }
} }
} });
// std::cout << GridLogMessage << "End parallel inner product" << std::endl; // std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir. // Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd); Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){ for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp; iScalar<vector_type> temp;
@ -362,7 +406,7 @@ template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim) static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid; GridBase *grid = lhs.Grid();
int fd = result.size(); int fd = result.size();
int ld = lsSum.size(); int ld = lsSum.size();
// sum over nodes. // sum over nodes.
@ -388,9 +432,9 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid; GridBase *grid = lhs.Grid();
assert(grid!=NULL); assert(grid!=NULL);
conformable(grid,rhs._grid); conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
@ -402,34 +446,36 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first Vector<vector_type> lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
lvSum[r]=zero; lvSum[r]=Zero();
} }
int e1= grid->_slice_nblock[orthogdim]; int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
parallel_for(int r=0;r<rd;r++){ auto lhv=lhs.View();
auto rhv=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss])); vector_type vv = TensorRemove(innerProduct(lhv[ss],rhv[ss]));
lvSum[r]=lvSum[r]+vv; lvSum[r]=lvSum[r]+vv;
} }
} }
} });
// Sum across simd lanes in the plane, breaking out orthog dir. // Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd); Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){ for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp; iScalar<vector_type> temp;
@ -470,7 +516,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = rhs._grid->GlobalDimensions()[Orthog]; int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock); std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock); sn.resize(Nblock);
@ -492,7 +538,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
scalar_type zscale(scale); scalar_type zscale(scale);
GridBase *grid = X._grid; GridBase *grid = X.Grid();
int Nsimd =grid->Nsimd(); int Nsimd =grid->Nsimd();
int Nblock =grid->GlobalDimensions()[orthogdim]; int Nblock =grid->GlobalDimensions()[orthogdim];
@ -505,8 +551,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
int e2 =grid->_slice_block [orthogdim]; int e2 =grid->_slice_block [orthogdim];
int stride =grid->_slice_stride[orthogdim]; int stride =grid->_slice_stride[orthogdim];
std::vector<int> icoor; Coordinate icoor;
for(int r=0;r<rd;r++){ for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@ -522,12 +567,15 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
parallel_for_nest2(int n=0;n<e1;n++){ auto Rv=R.View();
auto Xv=X.View();
auto Yv=Y.View();
thread_for_collapse(2, n, e1, {
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
R._odata[ss] = at*X._odata[ss]+Y._odata[ss]; Rv[ss] = at*Xv[ss]+Yv[ss];
}
} }
});
} }
}; };
@ -559,18 +607,18 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog]; int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid; GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; // int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl = nh-1; // int nl = nh-1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -578,28 +626,31 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2) auto X_v=X.View();
for(int n=0;n<nblock;n++){ auto Y_v=Y.View();
auto R_v=R.View();
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){ for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride]; s_x[i] = X_v[o+i*ostride];
} }
vobj dot; vobj dot;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
dot = Y[o+i*ostride]; dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){ for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i)); dot = dot + s_x[j]*(scale*aa(j,i));
} }
R[o+i*ostride]=dot; R_v[o+i*ostride]=dot;
} }
}} }});
} }
}; };
@ -610,17 +661,17 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog]; int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid; GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; // int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl=1; // int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -628,17 +679,19 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel auto R_v = R.View();
auto X_v = X.View();
thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){ thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){ for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride]; s_x[i] = X_v[o+i*ostride];
} }
vobj dot; vobj dot;
@ -647,11 +700,10 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
for(int j=1;j<Nblock;j++){ for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i)); dot = dot + s_x[j]*(scale*aa(j,i));
} }
R[o+i*ostride]=dot; R_v[o+i*ostride]=dot;
} }
}} }});
} }
}; };
@ -662,7 +714,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid; GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog]; int Nblock = FullGrid->GlobalDimensions()[Orthog];
@ -673,9 +725,9 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension; // int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
int nl = nh-1; // int nl = nh-1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@ -686,31 +738,33 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
#pragma omp parallel auto lhs_v=lhs.View();
auto rhs_v=rhs.View();
thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock); Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
#pragma omp for collapse(2) thread_for_collapse_in_region( 2, n,nblock,{
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){ for(int b=0;b<block;b++){
int o = n*stride + b; int o = n*stride + b;
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
Left [i] = lhs[o+i*ostride]; Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs[o+i*ostride]; Right[i] = rhs_v[o+i*ostride];
} }
for(int i=0;i<Nblock;i++){ for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){ for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]); auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp); auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp); auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}} }}
}} }});
#pragma omp critical thread_critical
{ {
mat += mat_thread; mat += mat_thread;
} }
@ -726,8 +780,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
return; return;
} }
} /*END NAMESPACE GRID*/ NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,226 @@
NAMESPACE_BEGIN(Grid);
// Warp width assumed at compile time. getNumBlocksAndThreads() aborts when the
// device reports a different warpSize, and reduceBlock() uses it as its vector width.
#define WARP_SIZE 32
// Table of device properties, defined elsewhere; indexed in this file by the
// device id obtained from cudaGetDevice().
extern cudaDeviceProp *gpu_props;
// Device-side counter, initialised to zero.
// NOTE(review): its users are not visible in this part of the file - presumably it
// counts finished blocks in the reduction kernel; confirm.
__device__ unsigned int retirementCount = 0;
// Round x up to a power of two (values that already are a power of two are
// returned unchanged). After the decrement, the highest set bit is smeared
// into every lower position, so the final increment carries into the next bit.
// Only shifts up to 16 are applied, i.e. the smear covers 32 bits.
template <class Iterator>
unsigned int nextPow2(Iterator x) {
  x -= 1;
  for (int shift = 1; shift <= 16; shift *= 2) {
    x |= x >> shift;
  }
  x += 1;
  return x;
}
// Pick the launch configuration for the reduction kernel on the current device.
//   n          - number of elements to be reduced (not used in the current choice)
//   sizeofsobj - bytes of shared memory required per thread
//   threads    - [out] threads per block
//   blocks     - [out] number of blocks
// Terminates the program with EXIT_FAILURE if the device warp size differs
// from the compile-time WARP_SIZE.
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {

  int device;
  cudaGetDevice(&device);

  Iterator warpSize            = gpu_props[device].warpSize;
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;

  // Each property is reported exactly once under its own label.
  std::cout << GridLogDebug << "GPU has:" << std::endl;
  std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
  std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
  std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;

  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
  }

  // let the number of threads in a block be warpSize times a power of two:
  // keep doubling while the doubled block still fits in shared memory and
  // does not exceed the per-block thread limit
  threads = warpSize;
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);

}
// Block-wide sum of one sobj per thread, using a scratch array indexed by thread id.
//   sdata - scratch array with one sobj slot per thread of the block
//           (callers in this file pass dynamically sized shared memory)
//   mySum - this thread's partial sum
//   tid   - this thread's index within the block
// On return sdata[0] holds the sum of mySum over all threads of the block.
// NOTE(review): the sdata[tid+i] reads and the stride-VEC gather below look like
// they assume blockDim.x is a multiple of WARP_SIZE -- confirm against the launch
// configuration.
template <class sobj, class Iterator>
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) {
Iterator blockSize = blockDim.x;
// cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp();
const Iterator VEC = WARP_SIZE;
// position of this thread inside its group of VEC consecutive thread ids
const Iterator vid = tid & (VEC-1);
sobj beta, temp;
memcpy((void *)&beta, (void *)&mySum, sizeof(sobj));
// stage 1: halving-stride reduction inside each group of VEC threads;
// afterwards the slot of the thread with vid == 0 holds the group total
for (int i = VEC/2; i > 0; i>>=1) {
if (vid < i) {
memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj));
beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
}
__syncwarp();
}
// all group totals must be written before thread 0 gathers them
__syncthreads();
// stage 2: thread 0 adds up the first slot of every group (stride VEC)
if (threadIdx.x == 0) {
beta = Zero();
for (Iterator i = 0; i < blockSize; i += VEC) {
memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj));
beta += temp;
}
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
}
// barrier after thread 0 has stored the block total in sdata[0]
__syncthreads();
}
// Per-block partial reduction of the input.
//   g_idata - input array of vobj; element i of the flattened input is
//             lane (i % nsimd) of g_idata[i / nsimd]
//   g_odata - output array; thread 0 of each block writes that block's partial
//             sum to g_odata[blockIdx.x]
//   n       - number of flattened (per-lane) elements to reduce
template <class vobj, class sobj, class Iterator>
__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
{
constexpr Iterator nsimd = vobj::Nsimd();
Iterator blockSize = blockDim.x;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *sdata = (sobj *)shmem_pointer;
// first level of reduction,
// each thread writes result in mySum
Iterator tid = threadIdx.x;
// each block starts at an offset of 2*blockSize per block index, and every
// thread handles elements i and i+blockSize before advancing by gridSize
Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
Iterator gridSize = blockSize*2*gridDim.x;
sobj mySum = Zero();
while (i < n) {
Iterator lane = i % nsimd;
Iterator ss = i / nsimd;
auto tmp = extractLane(lane,g_idata[ss]);
// assign the extracted lane into sobj (the accumulation type) before adding
sobj tmpD;
tmpD=tmp;
mySum +=tmpD;
// second element of the pair, guarded against running past n
if (i + blockSize < n) {
lane = (i+blockSize) % nsimd;
ss = (i+blockSize) / nsimd;
tmp = extractLane(lane,g_idata[ss]);
tmpD = tmp;
mySum += tmpD;
}
i += gridSize;
}
// copy mySum to shared memory and perform
// reduction for all threads in this block
reduceBlock(sdata, mySum, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Reduction kernel: sums the n flattened (per-lane) elements of lat.
//   lat    - input array of vobj
//   buffer - scratch/output array with at least gridDim.x entries; each block
//            first writes its partial sum to buffer[blockIdx.x]
//   n      - number of flattened elements
// The total ends up in buffer[0]: with a single block that is the value written
// by reduceBlocks(); with several blocks, the block that draws the last ticket
// from retirementCount sums buffer[0..gridDim.x-1] and overwrites buffer[0].
template <class vobj, class sobj,class Iterator>
__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
Iterator blockSize = blockDim.x;
// perform reduction for this block and
// write result to global memory buffer
reduceBlocks(lat, buffer, n);
if (gridDim.x > 1) {
const Iterator tid = threadIdx.x;
// set by thread 0, read by every thread of the block after the barrier below
__shared__ bool amLast;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
if (tid==0) {
// take a ticket; the previous counter value is returned
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
// true if this block is the last block to be done
amLast = (ticket == gridDim.x-1);
}
// each thread must read the correct value of amLast
__syncthreads();
if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1]
// each thread accumulates entries tid, tid+blockSize, ... into mySum
Iterator i = tid;
sobj mySum = Zero();
while (i < gridDim.x) {
mySum += buffer[i];
i += blockSize;
}
reduceBlock(smem, mySum, tid);
if (tid==0) {
buffer[0] = smem[0];
// reset count variable
retirementCount = 0;
}
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Sum all SIMD lanes of lat[0..osites) on the GPU and return the total as
// vobj::scalar_objectD.
//   lat    - pointer to osites vector objects, passed straight to the kernel
//   osites - number of vobj entries in lat
// The per-block partial sums live in a Vector<sobj> whose storage is handed to
// the kernel and then read back on the host, so it must be reachable from both
// (presumably guaranteed by Vector's allocator -- confirm).
// On a CUDA error the message is printed and the program exits with
// EXIT_FAILURE, so the failure is visible to the calling shell or job script.
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
  typedef typename vobj::scalar_objectD sobj;

  // total number of scalar objects to reduce: one per SIMD lane per site
  Integer nsimd= vobj::Nsimd();
  Integer size = osites*nsimd;

  Integer numThreads, numBlocks;
  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  // one sobj of dynamic shared memory per thread
  Integer smemSize = numThreads * sizeof(sobj);

  // one partial sum per block; the kernel leaves the final total in slot 0
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];

  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
  cudaDeviceSynchronize();

  cudaError err = cudaGetLastError();
  if ( cudaSuccess != err ) {
    printf("Cuda error %s\n",cudaGetErrorString( err ));
    exit(EXIT_FAILURE);
  }

  auto result = buffer_v[0];
  return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin wrapper around sumD_gpu(): runs the reduction with scalar_objectD as the
// accumulation type, then converts the total to vobj::scalar_object by
// assignment into a default-constructed value.
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
  using result_t = typename vobj::scalar_object;
  result_t total;
  total = sumD_gpu(lat,osites);
  return total;
}
NAMESPACE_END(Grid);

View File

@ -41,7 +41,7 @@
#undef RNG_FAST_DISCARD #undef RNG_FAST_DISCARD
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid // Allow the RNG state to be less dense than the fine grid
@ -108,12 +108,16 @@ namespace Grid {
template<class distribution,class generator> template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen) void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{ {
s=ComplexF(dist(gen),dist(gen)); // s=ComplexF(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
} }
template<class distribution,class generator> template<class distribution,class generator>
void fillScalar(ComplexD &s,distribution &dist,generator &gen) void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{ {
s=ComplexD(dist(gen),dist(gen)); // s=ComplexD(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
} }
class GridRNGbase { class GridRNGbase {
@ -165,7 +169,10 @@ namespace Grid {
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30; const int shift = 30;
uint64_t skip = site; ////////////////////////////////////////////////////////////////////
// Weird compiler bug in Intel 2018.1 under O3 was generating 32bit and not 64 bit left shift.
////////////////////////////////////////////////////////////////////
volatile uint64_t skip = site;
skip = skip<<shift; skip = skip<<shift;
@ -256,7 +263,7 @@ namespace Grid {
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}; }
template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){ template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
dist[0].reset(); dist[0].reset();
@ -333,13 +340,13 @@ namespace Grid {
}; };
class GridParallelRNG : public GridRNGbase { class GridParallelRNG : public GridRNGbase {
private:
double _time_counter; double _time_counter;
public:
GridBase *_grid; GridBase *_grid;
unsigned int _vol; unsigned int _vol;
public:
GridBase *Grid(void) const { return _grid; }
int generator_idx(int os,int is) { int generator_idx(int os,int is) {
return is*_grid->oSites()+os; return is*_grid->oSites()+os;
} }
@ -363,13 +370,14 @@ namespace Grid {
double inner_time_counter = usecond(); double inner_time_counter = usecond();
int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l._grid too int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too
int osites = _grid->oSites(); // guaranteed to be <= l._grid->oSites() by a factor multiplicity int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
parallel_for(int ss=0;ss<osites;ss++){ auto l_v = l.View();
std::vector<scalar_object> buf(Nsimd); thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
int sm = multiplicity * ss + m; // Maps the generator site to the fine site int sm = multiplicity * ss + m; // Maps the generator site to the fine site
@ -383,12 +391,13 @@ namespace Grid {
fillScalar(pointer[idx], dist[gdx], _generators[gdx]); fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
} }
// merge into SIMD lanes, FIXME suboptimal implementation // merge into SIMD lanes, FIXME suboptimal implementation
merge(l._odata[sm], buf); merge(l_v[sm], buf);
}
} }
});
// });
_time_counter += usecond()- inner_time_counter; _time_counter += usecond()- inner_time_counter;
}; }
void SeedUniqueString(const std::string &s){ void SeedUniqueString(const std::string &s){
std::vector<int> seeds; std::vector<int> seeds;
@ -417,12 +426,13 @@ namespace Grid {
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Everybody loops over global volume. // Everybody loops over global volume.
parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){ thread_for( gidx, _grid->_gsites, {
// Where is it? // Where is it?
int rank,o_idx,i_idx; int rank;
std::vector<int> gcoor; int o_idx;
int i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor); _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
@ -432,8 +442,7 @@ namespace Grid {
_generators[l_idx] = master_engine; _generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
} }
});
}
#else #else
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient // Machine and thread decomposition dependent seeding is efficient
@ -459,7 +468,7 @@ namespace Grid {
seeders[t] = Reseed(master_engine); seeders[t] = Reseed(master_engine);
} }
parallel_for(int t=0;t<Nthread;t++) { thread_for( t, Nthread, {
// set up one per local site in threaded fashion // set up one per local site in threaded fashion
std::vector<uint32_t> newseeds; std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid; std::uniform_int_distribution<uint32_t> uid;
@ -468,7 +477,7 @@ namespace Grid {
_generators[l] = Reseed(seeders[t],newseeds,uid); _generators[l] = Reseed(seeders[t],newseeds,uid);
} }
} }
} });
} }
#endif #endif
} }
@ -486,8 +495,8 @@ namespace Grid {
uint32_t the_number; uint32_t the_number;
// who // who
std::vector<int> gcoor;
int rank,o_idx,i_idx; int rank,o_idx,i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor); _grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
@ -512,5 +521,5 @@ namespace Grid {
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -32,19 +32,20 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// Tracing, transposing, peeking, poking // Tracing, transposing, peeking, poking
/////////////////////////////////////////////// ///////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace // Trace
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
-> Lattice<decltype(trace(lhs._odata[0]))>
{ {
Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid); Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = trace(lhs._odata[ss]); auto lhs_v = lhs.View();
} accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
});
return ret; return ret;
}; };
@ -52,16 +53,17 @@ namespace Grid {
// Trace Index level dependent operation // Trace Index level dependent operation
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{ {
Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid); Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]); auto lhs_v = lhs.View();
} accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
});
return ret; return ret;
}; };
NAMESPACE_END(Grid);
}
#endif #endif

View File

@ -25,10 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_TRANSFER_H #pragma once
#define GRID_LATTICE_TRANSFER_H
namespace Grid { NAMESPACE_BEGIN(Grid);
inline void subdivides(GridBase *coarse,GridBase *fine) inline void subdivides(GridBase *coarse,GridBase *fine)
{ {
@ -49,34 +48,39 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
// remove and insert a half checkerboard // remove and insert a half checkerboard
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
half.checkerboard = cb; half.Checkerboard() = cb;
parallel_for(int ss=0;ss<full._grid->oSites();ss++){ auto half_v = half.View();
auto full_v = full.View();
thread_for(ss, full.Grid()->oSites(),{
int cbos; int cbos;
std::vector<int> coor; Coordinate coor;
full._grid->oCoorFromOindex(coor,ss); full.Grid()->oCoorFromOindex(coor,ss);
cbos=half._grid->CheckerBoard(coor); cbos=half.Grid()->CheckerBoard(coor);
if (cbos==cb) { if (cbos==cb) {
int ssh=half._grid->oIndex(coor); int ssh=half.Grid()->oIndex(coor);
half._odata[ssh] = full._odata[ss]; half_v[ssh] = full_v[ss];
}
} }
});
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.checkerboard; int cb = half.Checkerboard();
parallel_for(int ss=0;ss<full._grid->oSites();ss++){ auto half_v = half.View();
std::vector<int> coor; auto full_v = full.View();
thread_for(ss,full.Grid()->oSites(),{
Coordinate coor;
int cbos; int cbos;
full._grid->oCoorFromOindex(coor,ss); full.Grid()->oCoorFromOindex(coor,ss);
cbos=half._grid->CheckerBoard(coor); cbos=half.Grid()->CheckerBoard(coor);
if (cbos==cb) { if (cbos==cb) {
int ssh=half._grid->oIndex(coor); int ssh=half.Grid()->oIndex(coor);
full._odata[ss]=half._odata[ssh]; full_v[ss]=half_v[ssh];
}
} }
});
} }
@ -85,8 +89,8 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData, const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis) const std::vector<Lattice<vobj> > &Basis)
{ {
GridBase * fine = fineData._grid; GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData._grid; GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
// checks // checks
@ -96,33 +100,33 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
conformable(Basis[i],fineData); conformable(Basis[i],fineData);
} }
std::vector<int> block_r (_ndimension); Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
} }
coarseData=zero; coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
// Loop over coars parallel, and then loop over fine associated with coarse. // Loop over coars parallel, and then loop over fine associated with coarse.
parallel_for(int sf=0;sf<fine->oSites();sf++){ thread_for( sf, fine->oSites(), {
int sc; int sc;
std::vector<int> coor_c(_ndimension); Coordinate coor_c(_ndimension);
std::vector<int> coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
PARALLEL_CRITICAL thread_critical {
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
auto Basis_ = Basis[i].View();
coarseData._odata[sc](i)=coarseData._odata[sc](i) coarseData_[sc](i)=coarseData_[sc](i) + innerProduct(Basis_[sf],fineData_[sf]);
+ innerProduct(Basis[i]._odata[sf],fineData._odata[sf]);
} }
} }
});
return; return;
} }
@ -132,18 +136,18 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
const Lattice<vobj> &fineX, const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY) const Lattice<vobj> &fineY)
{ {
GridBase * fine = fineZ._grid; GridBase * fine = fineZ.Grid();
GridBase * coarse= coarseA._grid; GridBase * coarse= coarseA.Grid();
fineZ.checkerboard=fineX.checkerboard; fineZ.Checkerboard()=fineX.Checkerboard();
assert(fineX.checkerboard==fineY.checkerboard); assert(fineX.Checkerboard()==fineY.Checkerboard());
subdivides(coarse,fine); // require they map subdivides(coarse,fine); // require they map
conformable(fineX,fineY); conformable(fineX,fineY);
conformable(fineX,fineZ); conformable(fineX,fineZ);
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
std::vector<int> block_r (_ndimension); Coordinate block_r (_ndimension);
// FIXME merge with subdivide checking routine as this is redundant // FIXME merge with subdivide checking routine as this is redundant
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
@ -151,20 +155,25 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
parallel_for(int sf=0;sf<fine->oSites();sf++){ auto fineZ_ = fineZ.View();
auto fineX_ = fineX.View();
auto fineY_ = fineY.View();
auto coarseA_= coarseA.View();
thread_for(sf, fine->oSites(), {
int sc; int sc;
std::vector<int> coor_c(_ndimension); Coordinate coor_c(_ndimension);
std::vector<int> coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
// z = A x + y // z = A x + y
fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf]; fineZ_[sf]=coarseA_[sc]*fineX_[sf]+fineY_[sf];
} });
return; return;
} }
@ -173,26 +182,29 @@ template<class vobj,class CComplex>
const Lattice<vobj> &fineX, const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY) const Lattice<vobj> &fineY)
{ {
typedef decltype(innerProduct(fineX._odata[0],fineY._odata[0])) dotp; typedef decltype(innerProduct(vobj(),vobj())) dotp;
GridBase *coarse(CoarseInner._grid); GridBase *coarse(CoarseInner.Grid());
GridBase *fine (fineX._grid); GridBase *fine (fineX.Grid());
Lattice<dotp> fine_inner(fine); fine_inner.checkerboard = fineX.checkerboard; Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion? // Precision promotion?
auto CoarseInner_ = CoarseInner.View();
auto coarse_inner_ = coarse_inner.View();
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
parallel_for(int ss=0;ss<coarse->oSites();ss++){ thread_for(ss, coarse->oSites(),{
CoarseInner._odata[ss] = coarse_inner._odata[ss]; CoarseInner_[ss] = coarse_inner_[ss];
} });
} }
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX) inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
{ {
GridBase *coarse = ip._grid; GridBase *coarse = ip.Grid();
Lattice<vobj> zz(fineX._grid); zz=zero; zz.checkerboard=fineX.checkerboard; Lattice<vobj> zz(fineX.Grid()); zz=Zero(); zz.Checkerboard()=fineX.Checkerboard();
blockInnerProduct(ip,fineX,fineX); blockInnerProduct(ip,fineX,fineX);
ip = pow(ip,-0.5); ip = pow(ip,-0.5);
blockZAXPY(fineX,ip,fineX,zz); blockZAXPY(fineX,ip,fineX,zz);
@ -202,14 +214,14 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
template<class vobj> template<class vobj>
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
{ {
GridBase * fine = fineData._grid; GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData._grid; GridBase * coarse= coarseData.Grid();
subdivides(coarse,fine); // require they map subdivides(coarse,fine); // require they map
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
std::vector<int> block_r (_ndimension); Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
@ -217,36 +229,36 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
// Turn this around to loop threaded over sc and interior loop // Turn this around to loop threaded over sc and interior loop
// over sf would thread better // over sf would thread better
coarseData=zero; coarseData=Zero();
parallel_region { auto coarseData_ = coarseData.View();
auto fineData_ = fineData.View();
thread_for(sf,fine->oSites(),{
int sc; int sc;
std::vector<int> coor_c(_ndimension); Coordinate coor_c(_ndimension);
std::vector<int> coor_f(_ndimension); Coordinate coor_f(_ndimension);
parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
PARALLEL_CRITICAL thread_critical {
coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf]; coarseData_[sc]=coarseData_[sc]+fineData_[sf];
}
} });
}
return; return;
} }
template<class vobj> template<class vobj>
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,std::vector<int> coor) inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
{ {
GridBase * fine = unpicked._grid; GridBase * fine = unpicked.Grid();
Lattice<vobj> zz(fine); zz.checkerboard = unpicked.checkerboard; Lattice<vobj> zz(fine); zz.Checkerboard() = unpicked.Checkerboard();
Lattice<iScalar<vInteger> > fcoor(fine); Lattice<iScalar<vInteger> > fcoor(fine);
zz = zero; zz = Zero();
picked = unpicked; picked = unpicked;
for(int d=0;d<fine->_ndimension;d++){ for(int d=0;d<fine->_ndimension;d++){
@ -262,16 +274,15 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
{ {
GridBase *coarse = ip._grid; GridBase *coarse = ip.Grid();
GridBase *fine = Basis[0]._grid; GridBase *fine = Basis[0].Grid();
int nbasis = Basis.size() ; int nbasis = Basis.size() ;
int _ndimension = coarse->_ndimension;
// checks // checks
subdivides(coarse,fine); subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
conformable(Basis[i]._grid,fine); conformable(Basis[i].Grid(),fine);
} }
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
@ -290,41 +301,41 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData, Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis) const std::vector<Lattice<vobj> > &Basis)
{ {
GridBase * fine = fineData._grid; GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData._grid; GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
// checks // checks
assert( nbasis == Basis.size() ); assert( nbasis == Basis.size() );
subdivides(coarse,fine); subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
conformable(Basis[i]._grid,fine); conformable(Basis[i].Grid(),fine);
} }
std::vector<int> block_r (_ndimension); Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
parallel_region { thread_for(sf,fine->oSites(),{
int sc; int sc;
std::vector<int> coor_c(_ndimension); Coordinate coor_c(_ndimension);
std::vector<int> coor_f(_ndimension); Coordinate coor_f(_ndimension);
parallel_for_internal(int sf=0;sf<fine->oSites();sf++){
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf]; auto basis_ = Basis[i].View();
else fineData._odata[sf]=fineData._odata[sf]+coarseData._odata[sc](i)*Basis[i]._odata[sf]; if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf];
} else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf];
}
} }
});
return; return;
} }
@ -337,8 +348,8 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vvobj::scalar_object ssobj; typedef typename vvobj::scalar_object ssobj;
GridBase *ig = in._grid; GridBase *ig = in.Grid();
GridBase *og = out._grid; GridBase *og = out.Grid();
int ni = ig->_ndimension; int ni = ig->_ndimension;
int no = og->_ndimension; int no = og->_ndimension;
@ -351,16 +362,16 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->lSites() == og->lSites()); assert(ig->lSites() == og->lSites());
} }
parallel_for(int idx=0;idx<ig->lSites();idx++){ thread_for(idx, ig->lSites(),{
sobj s; sobj s;
ssobj ss; ssobj ss;
std::vector<int> lcoor(ni); Coordinate lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor); ig->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(s,in,lcoor); peekLocalSite(s,in,lcoor);
ss=s; ss=s;
pokeLocalSite(ss,out,lcoor); pokeLocalSite(ss,out,lcoor);
} });
} }
@ -369,8 +380,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *lg = lowDim._grid; GridBase *lg = lowDim.Grid();
GridBase *hg = higherDim._grid; GridBase *hg = higherDim.Grid();
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
@ -389,10 +400,10 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
parallel_for(int idx=0;idx<lg->lSites();idx++){ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
std::vector<int> lcoor(nl); Coordinate lcoor(nl);
std::vector<int> hcoor(nh); Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor); lg->LocalIndexToLocalCoor(idx,lcoor);
int ddl=0; int ddl=0;
hcoor[orthog] = slice; hcoor[orthog] = slice;
@ -403,7 +414,7 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
} }
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDim,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDim,hcoor);
} });
} }
template<class vobj> template<class vobj>
@ -411,8 +422,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *lg = lowDim._grid; GridBase *lg = lowDim.Grid();
GridBase *hg = higherDim._grid; GridBase *hg = higherDim.Grid();
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
@ -430,10 +441,10 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
parallel_for(int idx=0;idx<lg->lSites();idx++){ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
std::vector<int> lcoor(nl); Coordinate lcoor(nl);
std::vector<int> hcoor(nh); Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor); lg->LocalIndexToLocalCoor(idx,lcoor);
int ddl=0; int ddl=0;
hcoor[orthog] = slice; hcoor[orthog] = slice;
@ -444,7 +455,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDim,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDim,lcoor);
} });
} }
@ -454,8 +465,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *lg = lowDim._grid; GridBase *lg = lowDim.Grid();
GridBase *hg = higherDim._grid; GridBase *hg = higherDim.Grid();
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
@ -471,10 +482,10 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
parallel_for(int idx=0;idx<lg->lSites();idx++){ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
std::vector<int> lcoor(nl); Coordinate lcoor(nl);
std::vector<int> hcoor(nh); Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor); lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
@ -482,7 +493,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDim,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDim,hcoor);
} }
} });
} }
@ -491,8 +502,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *lg = lowDim._grid; GridBase *lg = lowDim.Grid();
GridBase *hg = higherDim._grid; GridBase *hg = higherDim.Grid();
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
@ -508,10 +519,10 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
parallel_for(int idx=0;idx<lg->lSites();idx++){ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
std::vector<int> lcoor(nl); Coordinate lcoor(nl);
std::vector<int> hcoor(nh); Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor); lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
@ -519,7 +530,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDim,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDim,lcoor);
} }
} });
} }
@ -528,8 +539,8 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *cg = coarse._grid; GridBase *cg = coarse.Grid();
GridBase *fg = fine._grid; GridBase *fg = fine.Grid();
int nd = cg->_ndimension; int nd = cg->_ndimension;
@ -537,14 +548,14 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
assert(cg->_ndimension==fg->_ndimension); assert(cg->_ndimension==fg->_ndimension);
std::vector<int> ratio(cg->_ndimension); Coordinate ratio(cg->_ndimension);
for(int d=0;d<cg->_ndimension;d++){ for(int d=0;d<cg->_ndimension;d++){
ratio[d] = fg->_fdimensions[d]/cg->_fdimensions[d]; ratio[d] = fg->_fdimensions[d]/cg->_fdimensions[d];
} }
std::vector<int> fcoor(nd); Coordinate fcoor(nd);
std::vector<int> ccoor(nd); Coordinate ccoor(nd);
for(int g=0;g<fg->gSites();g++){ for(int g=0;g<fg->gSites();g++){
fg->GlobalIndexToGlobalCoor(g,fcoor); fg->GlobalIndexToGlobalCoor(g,fcoor);
@ -567,41 +578,46 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
typedef typename vobj::vector_type vtype; typedef typename vobj::vector_type vtype;
GridBase* in_grid = in._grid; GridBase* in_grid = in.Grid();
out.resize(in_grid->lSites()); out.resize(in_grid->lSites());
int ndim = in_grid->Nd(); int ndim = in_grid->Nd();
int in_nsimd = vtype::Nsimd(); int in_nsimd = vtype::Nsimd();
std::vector<std::vector<int> > in_icoor(in_nsimd); std::vector<Coordinate > in_icoor(in_nsimd);
for(int lane=0; lane < in_nsimd; lane++){ for(int lane=0; lane < in_nsimd; lane++){
in_icoor[lane].resize(ndim); in_icoor[lane].resize(ndim);
in_grid->iCoorFromIindex(in_icoor[lane], lane); in_grid->iCoorFromIindex(in_icoor[lane], lane);
} }
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index //loop over outer index
auto in_v = in.View();
thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd); ExtractPointerArray<sobj> out_ptrs(in_nsimd);
std::vector<int> in_ocoor(ndim); Coordinate in_ocoor(ndim);
in_grid->oCoorFromOindex(in_ocoor, in_oidx); in_grid->oCoorFromOindex(in_ocoor, in_oidx);
std::vector<int> lcoor(in_grid->Nd()); Coordinate lcoor(in_grid->Nd());
for(int lane=0; lane < in_nsimd; lane++){ for(int lane=0; lane < in_nsimd; lane++){
for(int mu=0;mu<ndim;mu++)
for(int mu=0;mu<ndim;mu++){
lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu]; lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
}
int lex; int lex;
Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions); Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
assert(lex < out.size());
out_ptrs[lane] = &out[lex]; out_ptrs[lane] = &out[lex];
} }
//Unpack into those ptrs //Unpack into those ptrs
const vobj & in_vobj = in._odata[in_oidx]; const vobj & in_vobj = in_v[in_oidx];
extract1(in_vobj, out_ptrs, 0); extract(in_vobj, out_ptrs, 0);
} });
} }
template<typename vobj, typename sobj> template<typename vobj, typename sobj>
@ -617,21 +633,21 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
int ndim = in_grid->Nd(); int ndim = in_grid->Nd();
int in_nsimd = vtype::Nsimd(); int in_nsimd = vtype::Nsimd();
std::vector<std::vector<int> > in_icoor(in_nsimd); std::vector<Coordinate > in_icoor(in_nsimd);
for(int lane=0; lane < in_nsimd; lane++){ for(int lane=0; lane < in_nsimd; lane++){
in_icoor[lane].resize(ndim); in_icoor[lane].resize(ndim);
in_grid->iCoorFromIindex(in_icoor[lane], lane); in_grid->iCoorFromIindex(in_icoor[lane], lane);
} }
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index thread_for(in_oidx, in_grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd); std::vector<sobj*> out_ptrs(in_nsimd);
std::vector<int> in_ocoor(ndim); Coordinate in_ocoor(ndim);
in_grid->oCoorFromOindex(in_ocoor, in_oidx); in_grid->oCoorFromOindex(in_ocoor, in_oidx);
std::vector<int> lcoor(in_grid->Nd()); Coordinate lcoor(in_grid->Nd());
for(int lane=0; lane < in_nsimd; lane++){ for(int lane=0; lane < in_nsimd; lane++){
for(int mu=0;mu<ndim;mu++) for(int mu=0;mu<ndim;mu++)
@ -645,7 +661,7 @@ unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
//Unpack into those ptrs //Unpack into those ptrs
const vobj & in_vobj = in._odata[in_oidx]; const vobj & in_vobj = in._odata[in_oidx];
extract1(in_vobj, out_ptrs, 0); extract1(in_vobj, out_ptrs, 0);
} });
} }
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
@ -657,28 +673,27 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
typedef typename vobj::vector_type vtype; typedef typename vobj::vector_type vtype;
GridBase* grid = out._grid; GridBase* grid = out.Grid();
assert(in.size()==grid->lSites()); assert(in.size()==grid->lSites());
int ndim = grid->Nd(); const int ndim = grid->Nd();
int nsimd = vtype::Nsimd(); constexpr int nsimd = vtype::Nsimd();
std::vector<std::vector<int> > icoor(nsimd); std::vector<Coordinate > icoor(nsimd);
for(int lane=0; lane < nsimd; lane++){ for(int lane=0; lane < nsimd; lane++){
icoor[lane].resize(ndim); icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane); grid->iCoorFromIindex(icoor[lane],lane);
} }
auto out_v = out.View();
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
std::vector<sobj*> ptrs(nsimd); ExtractPointerArray<sobj> ptrs(nsimd);
std::vector<int> ocoor(ndim); Coordinate ocoor(ndim);
Coordinate lcoor(ndim);
grid->oCoorFromOindex(ocoor, oidx); grid->oCoorFromOindex(ocoor, oidx);
std::vector<int> lcoor(grid->Nd());
for(int lane=0; lane < nsimd; lane++){ for(int lane=0; lane < nsimd; lane++){
for(int mu=0;mu<ndim;mu++){ for(int mu=0;mu<ndim;mu++){
@ -692,9 +707,9 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
//pack from those ptrs //pack from those ptrs
vobj vecobj; vobj vecobj;
merge1(vecobj, ptrs, 0); merge(vecobj, ptrs, 0);
out._odata[oidx] = vecobj; out_v[oidx] = vecobj;
} });
} }
template<typename vobj, typename sobj> template<typename vobj, typename sobj>
@ -711,21 +726,21 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
int ndim = grid->Nd(); int ndim = grid->Nd();
int nsimd = vtype::Nsimd(); int nsimd = vtype::Nsimd();
std::vector<std::vector<int> > icoor(nsimd); std::vector<Coordinate > icoor(nsimd);
for(int lane=0; lane < nsimd; lane++){ for(int lane=0; lane < nsimd; lane++){
icoor[lane].resize(ndim); icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane); grid->iCoorFromIindex(icoor[lane],lane);
} }
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index thread_for(oidx, grid->oSites(), {
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
std::vector<sobj*> ptrs(nsimd); std::vector<sobj*> ptrs(nsimd);
std::vector<int> ocoor(ndim); Coordinate ocoor(ndim);
grid->oCoorFromOindex(ocoor, oidx); grid->oCoorFromOindex(ocoor, oidx);
std::vector<int> lcoor(grid->Nd()); Coordinate lcoor(grid->Nd());
for(int lane=0; lane < nsimd; lane++){ for(int lane=0; lane < nsimd; lane++){
@ -742,25 +757,28 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
vobj vecobj; vobj vecobj;
merge1(vecobj, ptrs, 0); merge1(vecobj, ptrs, 0);
out._odata[oidx] = vecobj; out._odata[oidx] = vecobj;
} });
} }
//Convert a Lattice from one precision to another //Convert a Lattice from one precision to another
template<class VobjOut, class VobjIn> template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
assert(out._grid->Nd() == in._grid->Nd()); {
assert(out._grid->FullDimensions() == in._grid->FullDimensions()); assert(out.Grid()->Nd() == in.Grid()->Nd());
out.checkerboard = in.checkerboard; for(int d=0;d<out.Grid()->Nd();d++){
GridBase *in_grid=in._grid; assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
GridBase *out_grid = out._grid; }
out.Checkerboard() = in.Checkerboard();
GridBase *in_grid=in.Grid();
GridBase *out_grid = out.Grid();
typedef typename VobjOut::scalar_object SobjOut; typedef typename VobjOut::scalar_object SobjOut;
typedef typename VobjIn::scalar_object SobjIn; typedef typename VobjIn::scalar_object SobjIn;
int ndim = out._grid->Nd(); int ndim = out.Grid()->Nd();
int out_nsimd = out_grid->Nsimd(); int out_nsimd = out_grid->Nsimd();
std::vector<std::vector<int> > out_icoor(out_nsimd); std::vector<Coordinate > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){ for(int lane=0; lane < out_nsimd; lane++){
out_icoor[lane].resize(ndim); out_icoor[lane].resize(ndim);
@ -770,13 +788,14 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
std::vector<SobjOut> in_slex_conv(in_grid->lSites()); std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in); unvectorizeToLexOrdArray(in_slex_conv, in);
parallel_for(uint64_t out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){ auto out_v = out.View();
std::vector<int> out_ocoor(ndim); thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx); out_grid->oCoorFromOindex(out_ocoor, out_oidx);
std::vector<SobjOut*> ptrs(out_nsimd); ExtractPointerArray<SobjOut> ptrs(out_nsimd);
std::vector<int> lcoor(out_grid->Nd()); Coordinate lcoor(out_grid->Nd());
for(int lane=0; lane < out_nsimd; lane++){ for(int lane=0; lane < out_nsimd; lane++){
for(int mu=0;mu<ndim;mu++) for(int mu=0;mu<ndim;mu++)
@ -785,8 +804,8 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions); int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
ptrs[lane] = &in_slex_conv[llex]; ptrs[lane] = &in_slex_conv[llex];
} }
merge(out._odata[out_oidx], ptrs, 0); merge(out_v[out_oidx], ptrs, 0);
} });
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -845,8 +864,8 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
assert(full_vecs>=1); assert(full_vecs>=1);
GridBase * full_grid = full[0]._grid; GridBase * full_grid = full[0].Grid();
GridBase *split_grid = split._grid; GridBase *split_grid = split.Grid();
int ndim = full_grid->_ndimension; int ndim = full_grid->_ndimension;
int full_nproc = full_grid->_Nprocessors; int full_nproc = full_grid->_Nprocessors;
@ -855,18 +874,18 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
//////////////////////////////// ////////////////////////////////
// Checkerboard management // Checkerboard management
//////////////////////////////// ////////////////////////////////
int cb = full[0].checkerboard; int cb = full[0].Checkerboard();
split.checkerboard = cb; split.Checkerboard() = cb;
////////////////////////////// //////////////////////////////
// Checks // Checks
////////////////////////////// //////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension); assert(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){ for(int n=0;n<full_vecs;n++){
assert(full[n].checkerboard == cb); assert(full[n].Checkerboard() == cb);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]); assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]); assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
} }
} }
@ -874,7 +893,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
assert(nvector*split_nproc==full_nproc); assert(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs); assert(nvector == full_vecs);
std::vector<int> ratio(ndim); Coordinate ratio(ndim);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d]; ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
} }
@ -887,13 +906,13 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
for(int v=0;v<nvector;v++){ for(int v=0;v<nvector;v++){
unvectorizeToLexOrdArray(scalardata,full[v]); unvectorizeToLexOrdArray(scalardata,full[v]);
parallel_for(int site=0;site<lsites;site++){ thread_for(site,lsites,{
alldata[v*lsites+site] = scalardata[site]; alldata[v*lsites+site] = scalardata[site];
} });
} }
int nvec = nvector; // Counts down to 1 as we collapse dims int nvec = nvector; // Counts down to 1 as we collapse dims
std::vector<int> ldims = full_grid->_ldimensions; Coordinate ldims = full_grid->_ldimensions;
for(int d=ndim-1;d>=0;d--){ for(int d=ndim-1;d>=0;d--){
@ -919,8 +938,8 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol); int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
// Loop over reordered data post A2A // Loop over reordered data post A2A
parallel_for(int c=0;c<chunk;c++){ thread_for(c, chunk, {
std::vector<int> coor(ndim); Coordinate coor(ndim);
for(int m=0;m<M;m++){ for(int m=0;m<M;m++){
for(int s=0;s<sP;s++){ for(int s=0;s<sP;s++){
@ -942,7 +961,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
} }
} }
} });
ldims[d]*= ratio[d]; ldims[d]*= ratio[d];
lsites *= ratio[d]; lsites *= ratio[d];
@ -954,8 +973,8 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
template<class Vobj> template<class Vobj>
void Grid_split(Lattice<Vobj> &full,Lattice<Vobj> & split) void Grid_split(Lattice<Vobj> &full,Lattice<Vobj> & split)
{ {
int nvector = full._grid->_Nprocessors / split._grid->_Nprocessors; int nvector = full.Grid()->_Nprocessors / split.Grid()->_Nprocessors;
std::vector<Lattice<Vobj> > full_v(nvector,full._grid); std::vector<Lattice<Vobj> > full_v(nvector,full.Grid());
for(int n=0;n<nvector;n++){ for(int n=0;n<nvector;n++){
full_v[n] = full; full_v[n] = full;
} }
@ -971,8 +990,8 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
assert(full_vecs>=1); assert(full_vecs>=1);
GridBase * full_grid = full[0]._grid; GridBase * full_grid = full[0].Grid();
GridBase *split_grid = split._grid; GridBase *split_grid = split.Grid();
int ndim = full_grid->_ndimension; int ndim = full_grid->_ndimension;
int full_nproc = full_grid->_Nprocessors; int full_nproc = full_grid->_Nprocessors;
@ -981,18 +1000,18 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
//////////////////////////////// ////////////////////////////////
// Checkerboard management // Checkerboard management
//////////////////////////////// ////////////////////////////////
int cb = full[0].checkerboard; int cb = full[0].Checkerboard();
split.checkerboard = cb; split.Checkerboard() = cb;
////////////////////////////// //////////////////////////////
// Checks // Checks
////////////////////////////// //////////////////////////////
assert(full_grid->_ndimension==split_grid->_ndimension); assert(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){ for(int n=0;n<full_vecs;n++){
assert(full[n].checkerboard == cb); assert(full[n].Checkerboard() == cb);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
assert(full[n]._grid->_gdimensions[d]==split._grid->_gdimensions[d]); assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
assert(full[n]._grid->_fdimensions[d]==split._grid->_fdimensions[d]); assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
} }
} }
@ -1000,7 +1019,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
assert(nvector*split_nproc==full_nproc); assert(nvector*split_nproc==full_nproc);
assert(nvector == full_vecs); assert(nvector == full_vecs);
std::vector<int> ratio(ndim); Coordinate ratio(ndim);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d]; ratio[d] = full_grid->_processors[d]/ split_grid->_processors[d];
} }
@ -1019,7 +1038,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int nvec = 1; int nvec = 1;
uint64_t rsites = split_grid->lSites(); uint64_t rsites = split_grid->lSites();
std::vector<int> rdims = split_grid->_ldimensions; Coordinate rdims = split_grid->_ldimensions;
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
@ -1038,8 +1057,8 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
{ {
// Loop over reordered data post A2A // Loop over reordered data post A2A
parallel_for(int c=0;c<chunk;c++){ thread_for(c, chunk,{
std::vector<int> coor(ndim); Coordinate coor(ndim);
for(int m=0;m<M;m++){ for(int m=0;m<M;m++){
for(int s=0;s<sP;s++){ for(int s=0;s<sP;s++){
@ -1060,7 +1079,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
tmpdata[lex_c] = alldata[lex_r]; tmpdata[lex_c] = alldata[lex_r];
} }
} }
} });
} }
if ( split_grid->_processors[d] > 1 ) { if ( split_grid->_processors[d] > 1 ) {
@ -1076,14 +1095,12 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
lsites = full_grid->lSites(); lsites = full_grid->lSites();
for(int v=0;v<nvector;v++){ for(int v=0;v<nvector;v++){
// assert(v<full.size()); thread_for(site, lsites,{
parallel_for(int site=0;site<lsites;site++){
// assert(v*lsites+site < alldata.size());
scalardata[site] = alldata[v*lsites+site]; scalardata[site] = alldata[v*lsites+site];
} });
vectorizeFromLexOrdArray(scalardata,full[v]); vectorizeFromLexOrdArray(scalardata,full[v]);
} }
} }
} NAMESPACE_END(Grid);
#endif

View File

@ -33,17 +33,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
// Transpose // Transpose
/////////////////////////////////////////////// ///////////////////////////////////////////////
namespace Grid { NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose // Transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid); Lattice<vobj> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = transpose(lhs._odata[ss]); auto lhs_v = lhs.View();
} accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
});
return ret; return ret;
}; };
@ -51,13 +53,16 @@ namespace Grid {
// Index level dependent transpose // Index level dependent transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{ {
Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid); Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ auto ret_v = ret.View();
ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]); auto lhs_v = lhs.View();
} accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
});
return ret; return ret;
}; };
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -31,54 +31,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_UNARY_H #ifndef GRID_LATTICE_UNARY_H
#define GRID_LATTICE_UNARY_H #define GRID_LATTICE_UNARY_H
namespace Grid { NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs,RealD y){ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret(rhs._grid); Lattice<obj> ret_i(rhs_i.Grid());
ret.checkerboard = rhs.checkerboard; auto rhs = rhs_i.View();
conformable(ret,rhs); auto ret = ret_i.View();
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ ret.Checkerboard() = rhs.Checkerboard();
ret._odata[ss]=pow(rhs._odata[ss],y); accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y);
});
return ret_i;
} }
return ret; template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
} Lattice<obj> ret_i(rhs_i.Grid());
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs,Integer y){ auto rhs = rhs_i.View();
Lattice<obj> ret(rhs._grid); auto ret = ret_i.View();
ret.checkerboard = rhs.checkerboard; ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); accelerator_for(ss,rhs.size(),obj::Nsimd(),{
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ coalescedWrite(ret[ss],mod(rhs(ss),y));
ret._odata[ss]=mod(rhs._odata[ss],y); });
} return ret_i;
return ret;
} }
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs,Integer y){ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret(rhs._grid); Lattice<obj> ret_i(rhs_i.Grid());
ret.checkerboard = rhs.checkerboard; auto ret = ret_i.View();
conformable(ret,rhs); auto rhs = rhs_i.View();
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ ret.Checkerboard() = rhs_i.Checkerboard();
ret._odata[ss]=div(rhs._odata[ss],y); accelerator_for(ss,rhs.size(),obj::Nsimd(),{
} coalescedWrite(ret[ss],div(rhs(ss),y));
return ret; });
return ret_i;
} }
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret(rhs._grid); Lattice<obj> ret_i(rhs_i.Grid());
ret.checkerboard = rhs.checkerboard; auto rhs = rhs_i.View();
conformable(ret,rhs); auto ret = ret_i.View();
parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ ret.Checkerboard() = rhs.Checkerboard();
ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp); accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
});
return ret_i;
} }
return ret; NAMESPACE_END(Grid);
}
}
#endif #endif

View File

@ -35,7 +35,7 @@ directory
#include <cxxabi.h> #include <cxxabi.h>
#include <memory> #include <memory>
namespace Grid { NAMESPACE_BEGIN(Grid);
std::string demangle(const char* name) { std::string demangle(const char* name) {
@ -109,8 +109,9 @@ void Grid_quiesce_nodes(void) {
} }
void Grid_unquiesce_nodes(void) { void Grid_unquiesce_nodes(void) {
#ifdef GRID_COMMS_MPI #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
std::cout.clear(); std::cout.clear();
#endif #endif
} }
} NAMESPACE_END(Grid);

View File

@ -37,13 +37,12 @@
#include <execinfo.h> #include <execinfo.h>
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// Dress the output; use std::chrono for time stamping via the StopWatch class // Dress the output; use std::chrono for time stamping via the StopWatch class
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
class Colours{ class Colours{
protected: protected:
bool is_active; bool is_active;
@ -214,6 +213,6 @@ std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp);
#define BACKTRACE() BACKTRACEFP(stdout) #define BACKTRACE() BACKTRACEFP(stdout)
NAMESPACE_END(Grid);
}
#endif #endif

View File

@ -26,8 +26,7 @@
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_BINARY_IO_H #pragma once
#define GRID_BINARY_IO_H
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO #define USE_MPI_IO
@ -42,8 +41,7 @@
#include <arpa/inet.h> #include <arpa/inet.h>
#include <algorithm> #include <algorithm>
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Byte reversal garbage // Byte reversal garbage
@ -91,7 +89,7 @@ class BinaryIO {
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid; GridBase *grid = lat.Grid();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
@ -111,21 +109,20 @@ class BinaryIO {
lsites = 1; lsites = 1;
} }
PARALLEL_REGION thread_region
{ {
uint32_t nersc_csum_thr = 0; uint32_t nersc_csum_thr = 0;
PARALLEL_FOR_LOOP_INTERN thread_for_in_region( local_site, lsites,
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{ {
uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++) for (uint64_t j = 0; j < size32; j++)
{ {
nersc_csum_thr = nersc_csum_thr + site_buf[j]; nersc_csum_thr = nersc_csum_thr + site_buf[j];
} }
} });
PARALLEL_CRITICAL thread_critical
{ {
nersc_csum += nersc_csum_thr; nersc_csum += nersc_csum_thr;
} }
@ -134,28 +131,25 @@ PARALLEL_CRITICAL
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{ {
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
int nd = grid->_ndimension; int nd = grid->_ndimension;
uint64_t lsites =grid->lSites(); uint64_t lsites =grid->lSites();
if (fbuf.size()==1) { if (fbuf.size()==1) {
lsites=1; lsites=1;
} }
std::vector<int> local_vol =grid->LocalDimensions(); Coordinate local_vol =grid->LocalDimensions();
std::vector<int> local_start =grid->LocalStarts(); Coordinate local_start =grid->LocalStarts();
std::vector<int> global_vol =grid->FullDimensions(); Coordinate global_vol =grid->FullDimensions();
PARALLEL_REGION thread_region
{ {
std::vector<int> coor(nd); Coordinate coor(nd);
uint32_t scidac_csuma_thr=0; uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0; uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0; uint32_t site_crc=0;
PARALLEL_FOR_LOOP_INTERN thread_for_in_region( local_site, lsites,
for(uint64_t local_site=0;local_site<lsites;local_site++){ {
uint32_t * site_buf = (uint32_t *)&fbuf[local_site]; uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
@ -182,9 +176,9 @@ PARALLEL_FOR_LOOP_INTERN
// std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl; // std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29); scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31); scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
} });
PARALLEL_CRITICAL thread_critical
{ {
scidac_csuma^= scidac_csuma_thr; scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr; scidac_csumb^= scidac_csumb_thr;
@ -202,9 +196,9 @@ PARALLEL_CRITICAL
{ {
uint32_t * f = (uint32_t *)file_object; uint32_t * f = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t); uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){ thread_for( i, count, {
f[i] = ntohl(f[i]); f[i] = ntohl(f[i]);
} });
} }
// LE must Swap and switch to host // LE must Swap and switch to host
static inline void le32toh_v(void *file_object,uint64_t bytes) static inline void le32toh_v(void *file_object,uint64_t bytes)
@ -212,13 +206,13 @@ PARALLEL_CRITICAL
uint32_t *fp = (uint32_t *)file_object; uint32_t *fp = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t); uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){ thread_for(i,count,{
uint32_t f; uint32_t f;
f = fp[i]; f = fp[i];
// byte-swap into network order, then convert network to host // byte-swap into network order, then convert network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f); fp[i] = ntohl(f);
} });
} }
// BE is same as network // BE is same as network
@ -226,9 +220,9 @@ PARALLEL_CRITICAL
{ {
uint64_t * f = (uint64_t *)file_object; uint64_t * f = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t); uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){ thread_for( i, count, {
f[i] = Grid_ntohll(f[i]); f[i] = Grid_ntohll(f[i]);
} });
} }
// LE must swap and switch; // LE must swap and switch;
@ -236,7 +230,7 @@ PARALLEL_CRITICAL
{ {
uint64_t *fp = (uint64_t *)file_object; uint64_t *fp = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t); uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){ thread_for( i, count, {
uint64_t f,g; uint64_t f,g;
f = fp[i]; f = fp[i];
// byte-swap into network order, then convert network to host // byte-swap into network order, then convert network to host
@ -245,7 +239,7 @@ PARALLEL_CRITICAL
f = f >> 32; f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ; g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g); fp[i] = Grid_ntohll(g);
} });
} }
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Real action: // Real action:
@ -281,13 +275,13 @@ PARALLEL_CRITICAL
int nrank = grid->ProcessorCount(); int nrank = grid->ProcessorCount();
int myrank = grid->ThisRank(); int myrank = grid->ThisRank();
std::vector<int> psizes = grid->ProcessorGrid(); Coordinate psizes = grid->ProcessorGrid();
std::vector<int> pcoor = grid->ThisProcessorCoor(); Coordinate pcoor = grid->ThisProcessorCoor();
std::vector<int> gLattice= grid->GlobalDimensions(); Coordinate gLattice= grid->GlobalDimensions();
std::vector<int> lLattice= grid->LocalDimensions(); Coordinate lLattice= grid->LocalDimensions();
std::vector<int> lStart(ndim); Coordinate lStart(ndim);
std::vector<int> gStart(ndim); Coordinate gStart(ndim);
// Flatten the file // Flatten the file
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@ -546,7 +540,7 @@ PARALLEL_CRITICAL
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
@ -558,7 +552,7 @@ PARALLEL_CRITICAL
GridStopWatch timer; GridStopWatch timer;
timer.Start(); timer.Start();
parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]); thread_for(x,lsites, { munge(iodata[x], scalardata[x]); });
vectorizeFromLexOrdArray(scalardata,Umu); vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier(); grid->Barrier();
@ -582,7 +576,7 @@ PARALLEL_CRITICAL
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
uint64_t lsites = grid->lSites(), offsetCopy = offset; uint64_t lsites = grid->lSites(), offsetCopy = offset;
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry); int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0); bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
@ -596,7 +590,7 @@ PARALLEL_CRITICAL
GridStopWatch timer; timer.Start(); GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu); unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]); thread_for(x, lsites, { munge(scalardata[x],iodata[x]); });
grid->Barrier(); grid->Barrier();
timer.Stop(); timer.Stop();
@ -619,7 +613,7 @@ PARALLEL_CRITICAL
{ {
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl; std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
offset = offsetCopy; offset = offsetCopy;
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]); thread_for(x,lsites, { munge(scalardata[x],iodata[x]); });
} }
else else
{ {
@ -637,8 +631,8 @@ PARALLEL_CRITICAL
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Read a RNG; use IOobject and lexico map to an array of state // Read a RNG; use IOobject and lexico map to an array of state
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static inline void readRNG(GridSerialRNG &serial, static inline void readRNG(GridSerialRNG &serial_rng,
GridParallelRNG &parallel, GridParallelRNG &parallel_rng,
std::string file, std::string file,
uint64_t offset, uint64_t offset,
uint32_t &nersc_csum, uint32_t &nersc_csum,
@ -652,7 +646,7 @@ PARALLEL_CRITICAL
std::string format = "IEEE32BIG"; std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid; GridBase *grid = parallel_rng.Grid();
uint64_t gsites = grid->gSites(); uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@ -669,11 +663,11 @@ PARALLEL_CRITICAL
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
timer.Start(); timer.Start();
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){ thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx); parallel_rng.SetState(tmp,lidx);
} });
timer.Stop(); timer.Stop();
iodata.resize(1); iodata.resize(1);
@ -683,7 +677,7 @@ PARALLEL_CRITICAL
{ {
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin()); std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
serial.SetState(tmp,0); serial_rng.SetState(tmp,0);
} }
nersc_csum = nersc_csum + nersc_csum_tmp; nersc_csum = nersc_csum + nersc_csum_tmp;
@ -699,8 +693,8 @@ PARALLEL_CRITICAL
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Write a RNG; lexico map to an array of state and use IOobject // Write a RNG; lexico map to an array of state and use IOobject
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
static inline void writeRNG(GridSerialRNG &serial, static inline void writeRNG(GridSerialRNG &serial_rng,
GridParallelRNG &parallel, GridParallelRNG &parallel_rng,
std::string file, std::string file,
uint64_t offset, uint64_t offset,
uint32_t &nersc_csum, uint32_t &nersc_csum,
@ -712,7 +706,7 @@ PARALLEL_CRITICAL
const int RngStateCount = GridSerialRNG::RngStateCount; const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate; typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid; GridBase *grid = parallel_rng.Grid();
uint64_t gsites = grid->gSites(); uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites(); uint64_t lsites = grid->lSites();
@ -727,11 +721,11 @@ PARALLEL_CRITICAL
timer.Start(); timer.Start();
std::vector<RNGstate> iodata(lsites); std::vector<RNGstate> iodata(lsites);
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){ thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx); parallel_rng.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
} });
timer.Stop(); timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
@ -739,7 +733,7 @@ PARALLEL_CRITICAL
iodata.resize(1); iodata.resize(1);
{ {
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
serial.GetState(tmp,0); serial_rng.GetState(tmp,0);
std::copy(tmp.begin(),tmp.end(),iodata[0].begin()); std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
} }
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
@ -756,5 +750,4 @@ PARALLEL_CRITICAL
} }
}; };
} NAMESPACE_END(Grid);
#endif

View File

@ -24,8 +24,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ILDG_IO_H #pragma once
#define GRID_ILDG_IO_H
#ifdef HAVE_LIME #ifdef HAVE_LIME
#include <algorithm> #include <algorithm>
@ -43,8 +42,7 @@ extern "C" {
#include "lime.h" #include "lime.h"
} }
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
#define GRID_FIELD_NORM "FieldNormMetaData" #define GRID_FIELD_NORM "FieldNormMetaData"
#define GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) \ #define GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) \
@ -140,7 +138,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
///////////////////////////////////// /////////////////////////////////////
// Scidac Private File structure // Scidac Private File structure
///////////////////////////////////// /////////////////////////////////////
_scidacFile = scidacFile(field._grid); _scidacFile = scidacFile(field.Grid());
///////////////////////////////////// /////////////////////////////////////
// Scidac Private Record structure // Scidac Private Record structure
@ -227,10 +225,10 @@ class GridLimeReader : public BinaryIO {
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl; // std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites; uint64_t PayloadSize = sizeof(sobj) * field.Grid()->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl; // std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field._grid->_gsites<<std::endl; // std::cout << "R Gsites " <<field.Grid()->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl; // std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl; // std::cout << "R file size " <<file_bytes <<std::endl;
@ -406,7 +404,7 @@ class GridLimeWriter : public BinaryIO
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// Write a generic lattice field and csum // Write a generic lattice field and csum
// This routine is Collectively called by all nodes // This routine is Collectively called by all nodes
// in communicator used by the field._grid // in communicator used by the field.Grid()
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template<class vobj> template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name) void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
@ -425,8 +423,8 @@ class GridLimeWriter : public BinaryIO
// v) Continue writing scidac record. // v) Continue writing scidac record.
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
GridBase *grid = field._grid; GridBase *grid = field.Grid();
assert(boss_node == field._grid->IsBoss() ); assert(boss_node == field.Grid()->IsBoss() );
FieldNormMetaData FNMD; FNMD.norm2 = norm2(field); FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
@ -443,7 +441,7 @@ class GridLimeWriter : public BinaryIO
} }
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl; // std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl; // std::cout << "W Gsites " <<field.Grid()->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl; // std::cout << "W Payload expected " <<PayloadSize<<std::endl;
//////////////////////////////////////////////// ////////////////////////////////////////////////
@ -515,7 +513,7 @@ class ScidacWriter : public GridLimeWriter {
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord, void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0) const unsigned int recordScientificPrec = 0)
{ {
GridBase * grid = field._grid; GridBase * grid = field.Grid();
//////////////////////////////////////// ////////////////////////////////////////
// fill the Grid header // fill the Grid header
@ -557,7 +555,7 @@ class ScidacReader : public GridLimeReader {
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase * grid = field._grid; GridBase * grid = field.Grid();
//////////////////////////////////////// ////////////////////////////////////////
// fill the Grid header // fill the Grid header
@ -624,7 +622,7 @@ class IldgWriter : public ScidacWriter {
template <class vsimd> template <class vsimd>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description) void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
{ {
GridBase * grid = Umu._grid; GridBase * grid = Umu.Grid();
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj; typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@ -717,9 +715,9 @@ class IldgReader : public GridLimeReader {
typedef LorentzColourMatrixF fobj; typedef LorentzColourMatrixF fobj;
typedef LorentzColourMatrixD dobj; typedef LorentzColourMatrixD dobj;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
std::vector<int> dims = Umu._grid->FullDimensions(); Coordinate dims = Umu.Grid()->FullDimensions();
assert(dims.size()==4); assert(dims.size()==4);
@ -853,6 +851,7 @@ class IldgReader : public GridLimeReader {
// Minimally must find binary segment and checksum // Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format // Since this is an ILDG reader require ILDG format
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
assert(found_ildgLFN);
assert(found_ildgBinary); assert(found_ildgBinary);
assert(found_ildgFormat); assert(found_ildgFormat);
assert(found_scidacChecksum); assert(found_scidacChecksum);
@ -930,9 +929,9 @@ class IldgReader : public GridLimeReader {
} }
}; };
}} NAMESPACE_END(Grid);
//HAVE_LIME //HAVE_LIME
#endif #endif
#endif

View File

@ -32,7 +32,7 @@ extern "C" { // for linkage
#include "lime.h" #include "lime.h"
} }
namespace Grid { NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Data representation of records that enter ILDG and SciDac formats // Data representation of records that enter ILDG and SciDac formats
@ -91,7 +91,7 @@ struct scidacFile : Serializable {
return dimensions; return dimensions;
} }
void setDimensions(std::vector<int> dimensions) { void setDimensions(Coordinate dimensions) {
char delimiter = ' '; char delimiter = ' ';
std::stringstream stream; std::stringstream stream;
for(int i=0;i<dimensions.size();i++){ for(int i=0;i<dimensions.size();i++){
@ -232,6 +232,6 @@ struct usqcdPropInfo : Serializable {
}; };
#endif #endif
} NAMESPACE_END(Grid);
#endif #endif
#endif #endif

View File

@ -36,7 +36,7 @@
#include <sys/utsname.h> #include <sys/utsname.h>
#include <pwd.h> #include <pwd.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// Precision mapping // Precision mapping
@ -52,7 +52,8 @@ namespace Grid {
format = std::string("IEEE64BIG"); format = std::string("IEEE64BIG");
} }
return format; return format;
} };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// header specification/interpretation // header specification/interpretation
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -95,10 +96,9 @@ namespace Grid {
{} {}
}; };
namespace QCD { // PB disable using namespace - this is a header and forces namespace visibility for all
// including files
using namespace Grid; //using namespace Grid;
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data // Bit and Physical Checksumming and QA of data
@ -169,7 +169,7 @@ namespace Grid {
template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header) template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
{ {
GridBase *grid = field._grid; GridBase *grid = field.Grid();
std::string format = getFormatString<vobj>(); std::string format = getFormatString<vobj>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
@ -179,19 +179,19 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header) inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{ {
// How to convert data precision etc... // How to convert data precision etc...
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplF>::linkTrace(data); header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplF>::avgPlaquette(data); header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
} }
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header) inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{ {
// How to convert data precision etc... // How to convert data precision etc...
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplD>::linkTrace(data); header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplD>::avgPlaquette(data); header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
} }
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{ {
GridBase *grid = field._grid; GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixF>(); std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
@ -201,7 +201,7 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
} }
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{ {
GridBase *grid = field._grid; GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixD>(); std::string format = getFormatString<vLorentzColourMatrixD>();
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
@ -325,7 +325,6 @@ struct BinarySimpleMunger {
} }
} }
}; };
}
NAMESPACE_END(Grid);
}

View File

@ -30,8 +30,7 @@
#ifndef GRID_NERSC_IO_H #ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H #define GRID_NERSC_IO_H
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
using namespace Grid; using namespace Grid;
@ -57,7 +56,6 @@ namespace Grid {
// for the header-reader // for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field) static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{ {
uint64_t offset=0;
std::map<std::string,std::string> header; std::map<std::string,std::string> header;
std::string line; std::string line;
@ -138,8 +136,8 @@ namespace Grid {
{ {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
uint64_t offset = readHeader(file,Umu._grid,header); uint64_t offset = readHeader(file,Umu.Grid(),header);
FieldMetaData clone(header); FieldMetaData clone(header);
@ -190,8 +188,6 @@ namespace Grid {
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) { if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
std::cout << " Plaquette mismatch "<<std::endl; std::cout << " Plaquette mismatch "<<std::endl;
std::cout << Umu[0]<<std::endl;
std::cout << Umu[1]<<std::endl;
} }
if ( nersc_csum != header.checksum ) { if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl; std::cerr << " checksum mismatch " << std::endl;
@ -229,7 +225,7 @@ namespace Grid {
typedef LorentzColourMatrixD fobj3D; typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D; typedef LorentzColour2x3D fobj2D;
GridBase *grid = Umu._grid; GridBase *grid = Umu.Grid();
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
@ -274,7 +270,7 @@ namespace Grid {
header.ensemble_id = "UKQCD"; header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF"; header.ensemble_label = "DWF";
GridBase *grid = parallel._grid; GridBase *grid = parallel.Grid();
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
@ -321,7 +317,7 @@ namespace Grid {
{ {
typedef typename GridParallelRNG::RngStateType RngStateType; typedef typename GridParallelRNG::RngStateType RngStateType;
GridBase *grid = parallel._grid; GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header); uint64_t offset = readHeader(file,grid,header);
@ -356,8 +352,8 @@ namespace Grid {
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl; std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
} }
}; };
}} NAMESPACE_END(QCD);
#endif #endif

View File

@ -29,7 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B) #define RawConfig(A,B) (A<<8|B)
@ -72,4 +72,5 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" }, // { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif #endif
}; };
} NAMESPACE_END(Grid);

View File

@ -47,7 +47,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <x86intrin.h> #include <x86intrin.h>
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
#ifdef __linux__ #ifdef __linux__
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
@ -89,6 +89,9 @@ inline uint64_t cyclecount(void){
return tmp; return tmp;
} }
#elif defined __x86_64__ #elif defined __x86_64__
#ifdef GRID_NVCC
accelerator_inline uint64_t __rdtsc(void) { return 0; }
#endif
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return __rdtsc(); return __rdtsc();
// unsigned int dummy; // unsigned int dummy;
@ -212,7 +215,7 @@ public:
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
ign=::read(fd, &count, sizeof(long long)); ign=::read(fd, &count, sizeof(long long));
ign+=::read(cyclefd, &cycles, sizeof(long long)); ign+=::read(cyclefd, &cycles, sizeof(long long));
assert(ign=2*sizeof(long long)); assert(ign==2*sizeof(long long));
} }
elapsed = cyclecount() - begin; elapsed = cyclecount() - begin;
#else #else
@ -241,5 +244,6 @@ public:
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -2,7 +2,7 @@
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/perfmon/Stat.h> #include <Grid/perfmon/Stat.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
bool PmuStat::pmu_initialized=false; bool PmuStat::pmu_initialized=false;
@ -242,4 +242,5 @@ void PmuStat::KNLreadctrs(ctrs &c)
} }
#endif #endif
} NAMESPACE_END(Grid);

View File

@ -5,7 +5,7 @@
#define _KNIGHTS_LANDING_ROOTONLY #define _KNIGHTS_LANDING_ROOTONLY
#endif #endif
namespace Grid { NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Extra KNL counters from MCDRAM // Extra KNL counters from MCDRAM
@ -98,7 +98,8 @@ public:
}; };
} NAMESPACE_END(Grid);
#endif #endif

View File

@ -33,11 +33,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <ctime> #include <ctime>
#include <chrono> #include <chrono>
namespace Grid { NAMESPACE_BEGIN(Grid)
// Dress the output; use std::chrono // Dress the output; use std::chrono
// C++11 time facilities better? // C++11 time facilities better?
inline double usecond(void) { inline double usecond(void) {
struct timeval tv; struct timeval tv;
@ -125,5 +123,6 @@ public:
} }
}; };
} NAMESPACE_END(Grid)
#endif #endif

View File

@ -14,7 +14,12 @@
#ifndef SOURCE_PUGIXML_CPP #ifndef SOURCE_PUGIXML_CPP
#define SOURCE_PUGIXML_CPP #define SOURCE_PUGIXML_CPP
#include <Grid/pugixml/pugixml.h> #ifdef __NVCC__
#pragma push
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced" warning
#endif
#include "pugixml.h"
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
@ -202,7 +207,7 @@ PUGI__NS_BEGIN
// Without a template<> we'll get multiple definitions of the same static // Without a template<> we'll get multiple definitions of the same static
template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate; template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate; template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
template struct xml_memory_management_function_storage<int>;
typedef xml_memory_management_function_storage<int> xml_memory; typedef xml_memory_management_function_storage<int> xml_memory;
PUGI__NS_END PUGI__NS_END
@ -12768,6 +12773,10 @@ namespace pugi
#undef PUGI__THROW_ERROR #undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR #undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC
#pragma pop
#endif
#endif #endif
/** /**

View File

@ -29,55 +29,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_QCD_BASE_H #pragma once
#define GRID_QCD_BASE_H
namespace Grid{
namespace QCD {
static const int Xdir = 0; NAMESPACE_BEGIN(Grid);
static const int Ydir = 1;
static const int Zdir = 2;
static const int Tdir = 3;
static constexpr int Xdir = 0;
static constexpr int Ydir = 1;
static constexpr int Zdir = 2;
static constexpr int Tdir = 3;
static const int Xp = 0; static constexpr int Xp = 0;
static const int Yp = 1; static constexpr int Yp = 1;
static const int Zp = 2; static constexpr int Zp = 2;
static const int Tp = 3; static constexpr int Tp = 3;
static const int Xm = 4; static constexpr int Xm = 4;
static const int Ym = 5; static constexpr int Ym = 5;
static const int Zm = 6; static constexpr int Zm = 6;
static const int Tm = 7; static constexpr int Tm = 7;
static const int Nc=3; static constexpr int Nc=3;
static const int Ns=4; static constexpr int Ns=4;
static const int Nd=4; static constexpr int Nd=4;
static const int Nhs=2; // half spinor static constexpr int Nhs=2; // half spinor
static const int Nds=8; // double stored gauge field static constexpr int Nds=8; // double stored gauge field
static const int Ngp=2; // gparity index range static constexpr int Ngp=2; // gparity index range
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types // QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour // Index conventions: Lorentz x Spin x Colour
// note: static const int or constexpr will work for type deductions // note: static constexpr int or constexpr will work for type deductions
// with the intel compiler (up to version 17) // with the intel compiler (up to version 17)
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
#define ColourIndex 2 #define ColourIndex (2)
#define SpinIndex 1 #define SpinIndex (1)
#define LorentzIndex 0 #define LorentzIndex (0)
// Also should make these a named enum type // Also should make these a named enum type
static const int DaggerNo=0; static constexpr int DaggerNo=0;
static const int DaggerYes=1; static constexpr int DaggerYes=1;
static const int InverseNo=0; static constexpr int InverseNo=0;
static const int InverseYes=1; static constexpr int InverseYes=1;
// Useful traits is this a spin index // Useful traits is this a spin index
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
const int SpinorIndex = 2; const int SpinorIndex = 2;
template<typename T> struct isSpinor { template<typename T> struct isSpinor {
static const bool value = (SpinorIndex==T::TensorLevel); static constexpr bool value = (SpinorIndex==T::TensorLevel);
}; };
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ; template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ; template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
@ -382,35 +380,35 @@ namespace QCD {
////////////////////////////////////////////// //////////////////////////////////////////////
template<class vobj> template<class vobj>
void pokeColour(Lattice<vobj> &lhs, void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0))> & rhs, const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0))> & rhs,
int i) int i)
{ {
PokeIndex<ColourIndex>(lhs,rhs,i); PokeIndex<ColourIndex>(lhs,rhs,i);
} }
template<class vobj> template<class vobj>
void pokeColour(Lattice<vobj> &lhs, void pokeColour(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<ColourIndex>(lhs._odata[0],0,0))> & rhs, const Lattice<decltype(peekIndex<ColourIndex>(vobj(),0,0))> & rhs,
int i,int j) int i,int j)
{ {
PokeIndex<ColourIndex>(lhs,rhs,i,j); PokeIndex<ColourIndex>(lhs,rhs,i,j);
} }
template<class vobj> template<class vobj>
void pokeSpin(Lattice<vobj> &lhs, void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0))> & rhs, const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0))> & rhs,
int i) int i)
{ {
PokeIndex<SpinIndex>(lhs,rhs,i); PokeIndex<SpinIndex>(lhs,rhs,i);
} }
template<class vobj> template<class vobj>
void pokeSpin(Lattice<vobj> &lhs, void pokeSpin(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<SpinIndex>(lhs._odata[0],0,0))> & rhs, const Lattice<decltype(peekIndex<SpinIndex>(vobj(),0,0))> & rhs,
int i,int j) int i,int j)
{ {
PokeIndex<SpinIndex>(lhs,rhs,i,j); PokeIndex<SpinIndex>(lhs,rhs,i,j);
} }
template<class vobj> template<class vobj>
void pokeLorentz(Lattice<vobj> &lhs, void pokeLorentz(Lattice<vobj> &lhs,
const Lattice<decltype(peekIndex<LorentzIndex>(lhs._odata[0],0))> & rhs, const Lattice<decltype(peekIndex<LorentzIndex>(vobj(),0))> & rhs,
int i) int i)
{ {
PokeIndex<LorentzIndex>(lhs,rhs,i); PokeIndex<LorentzIndex>(lhs,rhs,i);
@ -499,12 +497,12 @@ namespace QCD {
// Trace lattice and non-lattice // Trace lattice and non-lattice
////////////////////////////////////////// //////////////////////////////////////////
template<int Index,class vobj> template<int Index,class vobj>
inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(lhs._odata[0]))> inline auto traceSpin(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<SpinIndex>(vobj()))>
{ {
return traceIndex<SpinIndex>(lhs); return traceIndex<SpinIndex>(lhs);
} }
template<int Index,class vobj> template<int Index,class vobj>
inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(lhs._odata[0]))> inline auto traceColour(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<ColourIndex>(vobj()))>
{ {
return traceIndex<ColourIndex>(lhs); return traceIndex<ColourIndex>(lhs);
} }
@ -527,9 +525,5 @@ namespace QCD {
Axial, 1, Axial, 1,
Tadpole, 2); Tadpole, 2);
} //namespace QCD NAMESPACE_END(Grid);
} // Grid
#endif

View File

@ -37,14 +37,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
// Abstract base interface // Abstract base interface
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/ActionCore.h> #include <Grid/qcd/action/ActionCore.h>
NAMESPACE_CHECK(ActionCore);
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Fermion actions; prevent coupling fermion.cc files to other headers // Fermion actions; prevent coupling fermion.cc files to other headers
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FermionCore.h> #include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_CHECK(FermionCore);
#include <Grid/qcd/action/fermion/Fermion.h> #include <Grid/qcd/action/fermion/Fermion.h>
NAMESPACE_CHECK(Fermion);
//////////////////////////////////////// ////////////////////////////////////////
// Pseudo fermion combinations for HMC // Pseudo fermion combinations for HMC
//////////////////////////////////////// ////////////////////////////////////////
#include <Grid/qcd/action/pseudofermion/PseudoFermion.h> #include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
NAMESPACE_CHECK(PseudoFermion);
#endif #endif

View File

@ -32,8 +32,7 @@ directory
#ifndef ACTION_BASE_H #ifndef ACTION_BASE_H
#define ACTION_BASE_H #define ACTION_BASE_H
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
template <class GaugeField > template <class GaugeField >
class Action class Action
@ -50,7 +49,6 @@ class Action
virtual ~Action(){} virtual ~Action(){}
}; };
} NAMESPACE_END(Grid);
}
#endif // ACTION_BASE_H #endif // ACTION_BASE_H

View File

@ -31,29 +31,37 @@ directory
#define QCD_ACTION_CORE #define QCD_ACTION_CORE
#include <Grid/qcd/action/ActionBase.h> #include <Grid/qcd/action/ActionBase.h>
NAMESPACE_CHECK(ActionBase);
#include <Grid/qcd/action/ActionSet.h> #include <Grid/qcd/action/ActionSet.h>
NAMESPACE_CHECK(ActionSet);
#include <Grid/qcd/action/ActionParams.h> #include <Grid/qcd/action/ActionParams.h>
NAMESPACE_CHECK(ActionParams);
//////////////////////////////////////////// ////////////////////////////////////////////
// Gauge Actions // Gauge Actions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/gauge/Gauge.h> #include <Grid/qcd/action/gauge/Gauge.h>
NAMESPACE_CHECK(Gauge);
//////////////////////////////////////////// ////////////////////////////////////////////
// Fermion prereqs // Fermion prereqs
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FermionCore.h> #include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_CHECK(ActionFermionCore);
//////////////////////////////////////////// ////////////////////////////////////////////
// Scalar Actions // Scalar Actions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/scalar/Scalar.h> #include <Grid/qcd/action/scalar/Scalar.h>
NAMESPACE_CHECK(Scalar);
//////////////////////////////////////////// ////////////////////////////////////////////
// Utility functions // Utility functions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/utils/Metric.h> #include <Grid/qcd/utils/Metric.h>
NAMESPACE_CHECK(Metric);
#include <Grid/qcd/utils/CovariantLaplacian.h> #include <Grid/qcd/utils/CovariantLaplacian.h>
NAMESPACE_CHECK(CovariantLaplacian);

View File

@ -32,25 +32,23 @@ directory
#ifndef GRID_QCD_ACTION_PARAMS_H #ifndef GRID_QCD_ACTION_PARAMS_H
#define GRID_QCD_ACTION_PARAMS_H #define GRID_QCD_ACTION_PARAMS_H
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
// These can move into a params header and be given MacroMagic serialisation // These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams { struct GparityWilsonImplParams {
bool overlapCommsCompute; Coordinate twists;
std::vector<int> twists; GparityWilsonImplParams() : twists(Nd, 0) {};
GparityWilsonImplParams() : twists(Nd, 0), overlapCommsCompute(false){};
}; };
struct WilsonImplParams { struct WilsonImplParams {
bool overlapCommsCompute; bool overlapCommsCompute;
std::vector<Real> twist_n_2pi_L; AcceleratorVector<Real,Nd> twist_n_2pi_L;
std::vector<Complex> boundary_phases; AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() : overlapCommsCompute(false) { WilsonImplParams() {
boundary_phases.resize(Nd, 1.0); boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
}; };
WilsonImplParams(const std::vector<Complex> phi) : boundary_phases(phi), overlapCommsCompute(false) { WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
} }
}; };
@ -88,11 +86,6 @@ namespace QCD {
BoundsCheckFreq(_BoundsCheckFreq){}; BoundsCheckFreq(_BoundsCheckFreq){};
}; };
NAMESPACE_END(Grid);
}
}
#endif #endif

View File

@ -30,10 +30,7 @@ directory
#ifndef ACTION_SET_H #ifndef ACTION_SET_H
#define ACTION_SET_H #define ACTION_SET_H
namespace Grid { NAMESPACE_BEGIN(Grid);
// Should drop this namespace here
namespace QCD {
////////////////////////////////// //////////////////////////////////
// Indexing of tuple types // Indexing of tuple types
@ -87,7 +84,7 @@ struct ActionLevel {
void push_back(Action<GenField>* ptr) { void push_back(Action<GenField>* ptr) {
// insert only in the correct vector // insert only in the correct vector
std::get< Index < GenField, action_hirep_types>::value >(actions_hirep).push_back(ptr); std::get< Index < GenField, action_hirep_types>::value >(actions_hirep).push_back(ptr);
}; }
template <class ActPtr> template <class ActPtr>
static void resize(ActPtr ap, unsigned int n) { static void resize(ActPtr ap, unsigned int n) {
@ -110,7 +107,6 @@ struct ActionLevel {
template <class GaugeField, class R> template <class GaugeField, class R>
using ActionSet = std::vector<ActionLevel<GaugeField, R> >; using ActionSet = std::vector<ActionLevel<GaugeField, R> >;
} // QCD NAMESPACE_END(Grid);
} // Grid
#endif // ACTION_SET_H #endif // ACTION_SET_H

View File

@ -32,8 +32,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> #include <Grid/qcd/action/fermion/CayleyFermion5D.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
// DJM: Abstract base class for EOFA fermion types. // DJM: Abstract base class for EOFA fermion types.
// Defines layout of additional EOFA-specific parameters and operators. // Defines layout of additional EOFA-specific parameters and operators.
@ -95,6 +94,7 @@ namespace QCD {
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) ); ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
}; };
}; };
}}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -26,39 +26,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_QCD_CAYLEY_FERMION_H #pragma once
#define GRID_QCD_CAYLEY_FERMION_H
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> #include <Grid/qcd/action/fermion/WilsonFermion5D.h>
namespace Grid { NAMESPACE_BEGIN(Grid);
namespace QCD {
template<typename T> struct switcheroo {
static inline int iscomplex() { return 0; }
template<class vec>
static inline vec mult(vec a, vec b) {
return real_mult(a,b);
}
};
template<> struct switcheroo<ComplexD> {
static inline int iscomplex() { return 1; }
template<class vec>
static inline vec mult(vec a, vec b) {
return a*b;
}
};
template<> struct switcheroo<ComplexF> {
static inline int iscomplex() { return 1; }
template<class vec>
static inline vec mult(vec a, vec b) {
return a*b;
}
};
template<class Impl> template<class Impl>
class CayleyFermion5D : public WilsonFermion5D<Impl> class CayleyFermion5D : public WilsonFermion5D<Impl>
@ -110,29 +82,16 @@ namespace Grid {
void M5D(const FermionField &psi, void M5D(const FermionField &psi,
const FermionField &phi, const FermionField &phi,
FermionField &chi, FermionField &chi,
std::vector<Coeff_t> &lower, Vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag, Vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper); Vector<Coeff_t> &upper);
void M5Ddag(const FermionField &psi, void M5Ddag(const FermionField &psi,
const FermionField &phi, const FermionField &phi,
FermionField &chi, FermionField &chi,
std::vector<Coeff_t> &lower, Vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag, Vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper); Vector<Coeff_t> &upper);
void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd> > & Matp, Vector<iSinglet<Simd> > & Matm);
void MooeeInternalAsm(const FermionField &in, FermionField &out,
int LLs, int site,
Vector<iSinglet<Simd> > &Matp,
Vector<iSinglet<Simd> > &Matm);
void MooeeInternalZAsm(const FermionField &in, FermionField &out,
int LLs, int site,
Vector<iSinglet<Simd> > &Matp,
Vector<iSinglet<Simd> > &Matm);
virtual void Instantiatable(void)=0; virtual void Instantiatable(void)=0;
@ -151,29 +110,29 @@ namespace Grid {
RealD mass; RealD mass;
// Save arguments to SetCoefficientsInternal // Save arguments to SetCoefficientsInternal
std::vector<Coeff_t> _gamma; Vector<Coeff_t> _gamma;
RealD _zolo_hi; RealD _zolo_hi;
RealD _b; RealD _b;
RealD _c; RealD _c;
// Cayley form Moebius (tanh and zolotarev) // Cayley form Moebius (tanh and zolotarev)
std::vector<Coeff_t> omega; Vector<Coeff_t> omega;
std::vector<Coeff_t> bs; // S dependent coeffs Vector<Coeff_t> bs; // S dependent coeffs
std::vector<Coeff_t> cs; Vector<Coeff_t> cs;
std::vector<Coeff_t> as; Vector<Coeff_t> as;
// For preconditioning Cayley form // For preconditioning Cayley form
std::vector<Coeff_t> bee; Vector<Coeff_t> bee;
std::vector<Coeff_t> cee; Vector<Coeff_t> cee;
std::vector<Coeff_t> aee; Vector<Coeff_t> aee;
std::vector<Coeff_t> beo; Vector<Coeff_t> beo;
std::vector<Coeff_t> ceo; Vector<Coeff_t> ceo;
std::vector<Coeff_t> aeo; Vector<Coeff_t> aeo;
// LDU factorisation of the eeoo matrix // LDU factorisation of the eeoo matrix
std::vector<Coeff_t> lee; Vector<Coeff_t> lee;
std::vector<Coeff_t> leem; Vector<Coeff_t> leem;
std::vector<Coeff_t> uee; Vector<Coeff_t> uee;
std::vector<Coeff_t> ueem; Vector<Coeff_t> ueem;
std::vector<Coeff_t> dee; Vector<Coeff_t> dee;
// Matrices of 5d ee inverse params // Matrices of 5d ee inverse params
Vector<iSinglet<Simd> > MatpInv; Vector<iSinglet<Simd> > MatpInv;
@ -189,8 +148,6 @@ namespace Grid {
GridRedBlackCartesian &FourDimRedBlackGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,const ImplParams &p= ImplParams()); RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
void CayleyReport(void); void CayleyReport(void);
void CayleyZeroCounters(void); void CayleyZeroCounters(void);
@ -205,22 +162,8 @@ namespace Grid {
protected: protected:
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c); virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
}; };
} NAMESPACE_END(Grid);
}
#define INSTANTIATE_DPERP(A)\
template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
template void CayleyFermion5D< A >::MooeeInv (const FermionField &psi, FermionField &chi); \
template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
#undef CAYLEY_DPERP_DENSE
#define CAYLEY_DPERP_CACHE
#undef CAYLEY_DPERP_LINALG
#define CAYLEY_DPERP_VEC
#endif

Some files were not shown because too many files have changed in this diff Show More