1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-09-19 09:41:05 +01:00

Compare commits

..

1 Commits

Author SHA1 Message Date
dbollweg
da81a73b4a Merge 461cd045c6 into da59379612 2024-03-27 02:11:32 +09:00
459 changed files with 7694 additions and 27357 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,2 +0,0 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -1,5 +0,0 @@
# Build BatchBlasBench with hipcc against MPICH and the ROCm BLAS libraries.
# ROCM_PATH, MPICH_DIR and CRAY_MPICH_ROOTDIR are taken from the environment.
CXX=hipcc
MPICXX=mpicxx
# Note ${ROCM_PATH}, not {$ROCM_PATH}: the misplaced brace expanded to a path
# with literal '{' and '}' around it, so the ROCm include directory was never found.
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
# Flags are intentionally left unquoted so that they split into separate arguments.
$CXX $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench

View File

@@ -1,2 +0,0 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -51,13 +51,11 @@ directory
#pragma nv_diag_suppress cast_to_qualified_type #pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files //disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored #pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon #pragma nv_diag_suppress extra_semicolon
#else #else
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero #pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type #pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files //disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored #pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon #pragma diag_suppress extra_semicolon

View File

@@ -59,7 +59,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice.h> #include <Grid/lattice/Lattice.h>
#include <Grid/cshift/Cshift.h> #include <Grid/cshift/Cshift.h>
#include <Grid/stencil/Stencil.h> #include <Grid/stencil/Stencil.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/parallelIO/BinaryIO.h> #include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h> #include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore) NAMESPACE_CHECK(GridCore)

View File

@@ -1,17 +1,9 @@
#ifndef GRID_STD_H #ifndef GRID_STD_H
#define GRID_STD_H #define GRID_STD_H
///////////////////
// Grid config
///////////////////
#include "Config.h"
/////////////////// ///////////////////
// Std C++ dependencies // Std C++ dependencies
/////////////////// ///////////////////
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <memory> #include <memory>
@@ -23,9 +15,7 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#include <random> #include <random>
#include <functional> #include <functional>
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h>
#include <strings.h> #include <strings.h>
#include <stdio.h> #include <stdio.h>
#include <signal.h> #include <signal.h>
@@ -33,35 +23,11 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#include <sys/time.h> #include <sys/time.h>
#include <chrono> #include <chrono>
#include <zlib.h> #include <zlib.h>
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
// Called by GRID_ASSERT after the failure message has been written; defined elsewhere.
void GridAbort(void);
// Write the C string A to stderr via ::write / ::strlen.
// The expansion already ends in ';'.
#define ASSLOG(A) ::write(STDERR_FILENO,A,::strlen(A));
// GRID_ASSERT(b): when b evaluates to false, write
//   " GRID_ASSERT failure: " <file> " : " <expression text> " : "
// to stderr and call GridAbort(). Only __FILE__ is reported, not the line number.
// NOTE: the macro expands to `if(!(b)) { ... };` - an if-statement followed by an
// empty statement - so it must not be used as the unbraced body of an `if` that
// has an `else` branch.
#ifdef HAVE_EXECINFO_H
// Variant with <execinfo.h>: additionally captures up to _NBACKTRACE frames into
// Grid_backtrace_buffer and writes their symbols to stderr before aborting.
#define GRID_ASSERT(b) if(!(b)) { \
ASSLOG(" GRID_ASSERT failure: "); \
ASSLOG(__FILE__); \
ASSLOG(" : "); \
ASSLOG(#b); \
ASSLOG(" : "); \
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE); \
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,STDERR_FILENO); \
GridAbort(); \
};
#else
// Variant without <execinfo.h>: same message, no backtrace.
#define GRID_ASSERT(b) if(!(b)) { \
ASSLOG(" GRID_ASSERT failure: "); \
ASSLOG(__FILE__); \
ASSLOG(" : "); \
ASSLOG(#b); \
ASSLOG(" : "); \
GridAbort(); \
};
#endif
///////////////////
// Grid config
///////////////////
#include "Config.h"
#ifdef TOFU #ifdef TOFU
#undef GRID_COMMS_THREADS #undef GRID_COMMS_THREADS

View File

@@ -68,10 +68,8 @@ if BUILD_FERMION_REPS
endif endif
if BUILD_SP if BUILD_SP
extra_sources+=$(SP_FERMION_FILES) extra_sources+=$(SP_FERMION_FILES)
if BUILD_FERMION_REPS
extra_sources+=$(SP_TWOIND_FERMION_FILES) extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif endif
endif
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a

View File

@@ -29,7 +29,6 @@ directory
#pragma once #pragma once
#include <type_traits> #include <type_traits>
#include <exception>
#include <cassert> #include <cassert>
#define NAMESPACE_BEGIN(A) namespace A { #define NAMESPACE_BEGIN(A) namespace A {
@@ -37,7 +36,3 @@ directory
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid) #define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid) #define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" ); #define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );
// EXCEPTION_CHECK_BEGIN(A) ... EXCEPTION_CHECK_END(A) bracket a region of code.
// Any std::exception escaping the region is logged to std::cerr (after
// BACKTRACEFP(stderr)) with the enclosing function, the line and what(),
// and is then rethrown unchanged. The argument A is not used.
#define EXCEPTION_CHECK_BEGIN(A) try {
// Catch by const reference: catching std::exception by value slices derived
// exception types, so what() would no longer report the derived class's message.
#define EXCEPTION_CHECK_END(A) } catch ( const std::exception &e ) { BACKTRACEFP(stderr); std::cerr << __PRETTY_FUNCTION__ << " : " <<__LINE__<< " Caught exception "<<e.what()<<std::endl; throw; }

View File

@@ -29,9 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H #ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H
NAMESPACE_CHECK(blas);
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_CHECK(algorithms); NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h> #include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
@@ -47,13 +44,7 @@ NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/RemezGeneral.h> #include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h> #include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx); NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/Deflation.h> #include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
// Not really deflation, but useful
#include <Grid/algorithms/blas/MomentumProject.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad); NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h> #include <Grid/algorithms/iterative/BiCGSTAB.h>
@@ -76,11 +67,10 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h> #include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h> #include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/iterative/AdefGeneric.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
NAMESPACE_CHECK(PowerMethod); NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/multigrid/MultiGrid.h> #include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(multigrid); NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>
#endif #endif

View File

@@ -56,6 +56,243 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
blockSum(CoarseInner,fine_inner_msk); blockSum(CoarseInner,fine_inner_msk);
} }
// Stencil geometry used by the coarse operator: for Nd directions there are
// 2*Nd hop points (displacement +1 then -1 for each direction) followed by one
// self point (displacement 0), i.e. npoint = 2*Nd+1 entries in each table.
class Geometry {
public:
  int npoint;                       // number of stencil points, 2*Nd+1
  int base;                         // 1 when constructed with _d==5, else 0; offset added to every hop direction
  std::vector<int> directions ;     // direction of each point
  std::vector<int> displacements;   // +1, -1 or 0 for each point
  std::vector<int> points_dagger;   // index of the point with the opposite displacement

  // _d is the dimension of the grid. _d==5 is treated as a 4d stencil whose
  // directions are shifted by one (base=1): make coarse grid stencil for 4d, not 5d.
  Geometry(int _d) {
    base = (_d==5) ? 1:0;
    if ( _d==5 ) _d=4;

    npoint = 2*_d+1;
    directions.resize(npoint);
    displacements.resize(npoint);
    points_dagger.resize(npoint);
    for(int d=0;d<_d;d++){
      directions[d ] = d+base;
      directions[d+_d] = d+base;
      displacements[d ] = +1;
      displacements[d+_d]= -1;
      points_dagger[d ] = d+_d;
      points_dagger[d+_d] = d;
    }
    // Self point is stored last and is its own dagger.
    directions [2*_d]=0;
    displacements[2*_d]=0;
    points_dagger[2*_d]=2*_d;
  }

  // Return the index p such that directions[p]==dir and displacements[p]==disp.
  //
  // Indexing (directions are the faster index):
  // 4d (base = 0):
  //   point  0  1  2  3  4  5  6  7  8
  //   dir    0  1  2  3  0  1  2  3  0
  //   disp  +1 +1 +1 +1 -1 -1 -1 -1  0
  // 5d (base = 1):
  //   point  0  1  2  3  4  5  6  7  8
  //   dir    1  2  3  4  1  2  3  4  0
  //   disp  +1 +1 +1 +1 -1 -1 -1 -1  0
  // (The older "displacements faster" layout, (4*(dir-base)+1-disp)/2, is not used.)
  //
  // disp must be -1, 0 or +1. The self point is addressed as (dir=0, disp=0).
  // For hop points dir must lie in [base, base+Nd).
  int point(int dir, int disp) {
    assert(disp == -1 || disp == 0 || disp == 1);

    // Number of directions, recovered from the table size set up by the constructor.
    const int nd = (npoint-1)/2;

    // Test for the self point BEFORE range-checking dir: with base==1 the self
    // point has dir==0, which lies outside the hop range [base, base+nd) and
    // would otherwise trip the assertion below.
    if (dir == 0 && disp == 0)
      return 2*nd;

    assert(base+0 <= dir && dir < base+nd);

    // (1-disp)/2 is 0 for disp=+1 and 1 for disp=-1, selecting the first or
    // second half of the hop table. disp==0 with dir!=0 is not a stencil point;
    // integer division maps it onto the +1 hop, as in the original code.
    return (1 - disp) / 2 * nd + dir - base;
  }
};
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possible more fine grained control is needed than a linear sweep,
// but huge productivity gain if this is simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
};
// Fine Object == (per site) type of fine field // Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors // nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
@@ -99,7 +336,7 @@ public:
CoarseMatrix AselfInvEven; CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd; CoarseMatrix AselfInvOdd;
deviceVector<RealD> dag_factor; Vector<RealD> dag_factor;
/////////////////////// ///////////////////////
// Interface // Interface
@@ -124,13 +361,9 @@ public:
int npoint = geom.npoint; int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint); Vector<Aview> AcceleratorViewContainer;
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) { for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -165,7 +398,7 @@ public:
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}; };
void Mdag (const CoarseVector &in, CoarseVector &out) void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -194,14 +427,9 @@ public:
int npoint = geom.npoint; int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -210,10 +438,10 @@ public:
int osites=Grid()->oSites(); int osites=Grid()->oSites();
deviceVector<int> points(geom.npoint); Vector<int> points(geom.npoint, 0);
for(int p=0; p<geom.npoint; p++) { for(int p=0; p<geom.npoint; p++)
acceleratorPut(points[p],geom.points_dagger[p]); points[p] = geom.points_dagger[p];
}
auto points_p = &points[0]; auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0]; RealD* dag_factor_p = &dag_factor[0];
@@ -245,7 +473,7 @@ public:
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
} }
void MdirComms(const CoarseVector &in) void MdirComms(const CoarseVector &in)
@@ -260,14 +488,8 @@ public:
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite); autoView( out_v , out, AcceleratorWrite);
@@ -300,7 +522,7 @@ public:
} }
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
} }
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{ {
@@ -309,7 +531,7 @@ public:
if ((out.size()!=ndir)&&(out.size()!=ndir+1)) { if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
std::cout <<"MdirAll out size "<< out.size()<<std::endl; std::cout <<"MdirAll out size "<< out.size()<<std::endl;
std::cout <<"MdirAll ndir "<< ndir<<std::endl; std::cout <<"MdirAll ndir "<< ndir<<std::endl;
GRID_ASSERT(0); assert(0);
} }
for(int p=0;p<ndir;p++){ for(int p=0;p<ndir;p++){
MdirCalc(in,out[p],p); MdirCalc(in,out[p],p);
@@ -373,7 +595,7 @@ public:
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
GRID_ASSERT(in.Checkerboard() == Even); assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd; out.Checkerboard() = Odd;
DhopInternal(StencilEven, Aodd, in, out, dag); DhopInternal(StencilEven, Aodd, in, out, dag);
@@ -383,7 +605,7 @@ public:
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
GRID_ASSERT(in.Checkerboard() == Odd); assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even; out.Checkerboard() = Even;
DhopInternal(StencilOdd, Aeven, in, out, dag); DhopInternal(StencilOdd, Aeven, in, out, dag);
@@ -391,7 +613,7 @@ public:
void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) { void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even); assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
CoarseMatrix *Aself = nullptr; CoarseMatrix *Aself = nullptr;
if(in.Grid()->_isCheckerBoarded) { if(in.Grid()->_isCheckerBoarded) {
@@ -406,7 +628,7 @@ public:
Aself = (inv) ? &AselfInv : &A[geom.npoint-1]; Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
DselfInternal(Stencil, *Aself, in, out, dag); DselfInternal(Stencil, *Aself, in, out, dag);
} }
GRID_ASSERT(Aself != nullptr); assert(Aself != nullptr);
} }
void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a, void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
@@ -484,20 +706,14 @@ public:
// determine in what order we need the points // determine in what order we need the points
int npoint = geom.npoint-1; int npoint = geom.npoint-1;
deviceVector<int> points(npoint); Vector<int> points(npoint, 0);
for(int p=0; p<npoint; p++) { for(int p=0; p<npoint; p++)
int val = (dag && !hermitian) ? geom.points_dagger[p] : p; points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
auto points_p = &points[0]; auto points_p = &points[0];
deviceVector<Aview> AcceleratorViewContainer(geom.npoint); Vector<Aview> AcceleratorViewContainer;
hostVector<Aview> hAcceleratorViewContainer(geom.npoint); for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@@ -560,7 +776,7 @@ public:
}); });
} }
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose(); for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
} }
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) : CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@@ -611,13 +827,11 @@ public:
} }
// GPU readable prefactor // GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, { thread_for(i, nbasis*nbasis, {
int j = i/nbasis; int j = i/nbasis;
int k = i%nbasis; int k = i%nbasis;
h_dag_factor[i] = dag_factor_eigen(j, k); dag_factor[i] = dag_factor_eigen(j, k);
}); });
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
} }
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
@@ -697,7 +911,7 @@ public:
evenmask = where(mod(bcb,2)==(Integer)0,one,zero); evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
oddmask = one-evenmask; oddmask = one-evenmask;
GRID_ASSERT(self_stencil!=-1); assert(self_stencil!=-1);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){

View File

@@ -28,15 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_FFT_H_ #ifndef _GRID_FFT_H_
#define _GRID_FFT_H_ #define _GRID_FFT_H_
#ifdef GRID_CUDA
#include <cufft.h>
#endif
#ifdef GRID_HIP
#include <hipfft/hipfft.h>
#endif
#if !defined(GRID_CUDA) && !defined(GRID_HIP)
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
#if defined(USE_MKL) || defined(GRID_SYCL) #if defined(USE_MKL) || defined(GRID_SYCL)
#include <fftw/fftw3.h> #include <fftw/fftw3.h>
@@ -44,190 +35,88 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <fftw3.h> #include <fftw3.h>
#endif #endif
#endif #endif
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#ifndef FFTW_FORWARD template<class scalar> struct FFTW { };
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#define FFTW_ESTIMATE (0)
#endif
template<class scalar> struct FFTW {
};
#ifdef GRID_HIP
// hipFFT implementation of the FFTW-style wrapper for double precision complex data.
template<> struct FFTW<ComplexD> {
public:
  using FFTW_scalar = hipfftDoubleComplex;
  using FFTW_plan   = hipfftHandle;

  static const int forward=FFTW_FORWARD;
  static const int backward=FFTW_BACKWARD;

  // Create a batched Z2Z plan. inembed/onembed, sign and flags are accepted for
  // signature compatibility but not used: n is passed for both embed arguments.
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
                                      FFTW_scalar *in, int *inembed,
                                      int istride, int idist,
                                      FFTW_scalar *out, int *onembed,
                                      int ostride, int odist,
                                      int sign, unsigned flags) {
    FFTW_plan plan;
    auto rv = hipfftPlanMany(&plan,
                             rank,n,
                             n,istride,idist,
                             n,ostride,odist,
                             HIPFFT_Z2Z,howmany);
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
    return plan;
  }

  // Execute the plan in the direction chosen by sign (forward when sign==forward,
  // backward otherwise), then call accelerator_barrier() and check the result.
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    const int direction = ( sign == forward ) ? HIPFFT_FORWARD : HIPFFT_BACKWARD;
    hipfftResult rv = hipfftExecZ2Z(p,in,out,direction);
    accelerator_barrier();
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
  }

  // Release the hipFFT plan.
  inline static void fftw_destroy_plan(const FFTW_plan p) {
    hipfftDestroy(p);
  }
};
// hipFFT implementation of the FFTW-style wrapper for single precision complex data.
template<> struct FFTW<ComplexF> {
public:
  using FFTW_scalar = hipfftComplex;
  using FFTW_plan   = hipfftHandle;

  static const int forward=FFTW_FORWARD;
  static const int backward=FFTW_BACKWARD;

  // Create a batched C2C plan. inembed/onembed, sign and flags are accepted for
  // signature compatibility but not used: n is passed for both embed arguments.
  static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
                                      FFTW_scalar *in, int *inembed,
                                      int istride, int idist,
                                      FFTW_scalar *out, int *onembed,
                                      int ostride, int odist,
                                      int sign, unsigned flags) {
    FFTW_plan plan;
    auto rv = hipfftPlanMany(&plan,
                             rank,n,
                             n,istride,idist,
                             n,ostride,odist,
                             HIPFFT_C2C,howmany);
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
    return plan;
  }

  // Execute the plan in the direction chosen by sign (forward when sign==forward,
  // backward otherwise), then call accelerator_barrier() and check the result.
  inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
    const int direction = ( sign == forward ) ? HIPFFT_FORWARD : HIPFFT_BACKWARD;
    hipfftResult rv = hipfftExecC2C(p,in,out,direction);
    accelerator_barrier();
    GRID_ASSERT(rv==HIPFFT_SUCCESS);
  }

  // Release the hipFFT plan.
  inline static void fftw_destroy_plan(const FFTW_plan p) {
    hipfftDestroy(p);
  }
};
#endif
#ifdef GRID_CUDA
template<> struct FFTW<ComplexD> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef cufftDoubleComplex FFTW_scalar;
typedef cufftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_Z2Z,howmany);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
if ( sign == forward ) cufftExecZ2Z(p,in,out,CUFFT_FORWARD);
else cufftExecZ2Z(p,in,out,CUFFT_INVERSE);
accelerator_barrier();
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
cufftDestroy(p);
}
};
template<> struct FFTW<ComplexF> {
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
typedef cufftComplex FFTW_scalar;
typedef cufftHandle FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed,
int istride, int idist,
FFTW_scalar *out, int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
FFTW_plan p;
cufftPlanMany(&p,rank,n,n,istride,idist,n,ostride,odist,CUFFT_C2C,howmany);
return p;
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) {
if ( sign == forward ) cufftExecC2C(p,in,out,CUFFT_FORWARD);
else cufftExecC2C(p,in,out,CUFFT_INVERSE);
accelerator_barrier();
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
cufftDestroy(p);
}
};
#endif
#if !defined(GRID_CUDA) && !defined(GRID_HIP)
#ifdef HAVE_FFTW #ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> { template<> struct FFTW<ComplexD> {
public: public:
typedef fftw_complex FFTW_scalar; typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan; typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed, static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist, int istride, int idist,
FFTW_scalar *out, int *onembed, FFTW_scalar *out, const int *onembed,
int ostride, int odist, int ostride, int odist,
int sign, unsigned flags) { int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
} }
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) { static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out); ::fftw_execute_dft(p,in,out);
} }
inline static void fftw_destroy_plan(const FFTW_plan p) { inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p); ::fftw_destroy_plan(p);
} }
}; };
template<> struct FFTW<ComplexF> { template<> struct FFTW<ComplexF> {
public: public:
typedef fftwf_complex FFTW_scalar; typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan; typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, int *n,int howmany,
FFTW_scalar *in, int *inembed, static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist, int istride, int idist,
FFTW_scalar *out, int *onembed, FFTW_scalar *out, const int *onembed,
int ostride, int odist, int ostride, int odist,
int sign, unsigned flags) { int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
} }
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out, int sign) { static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out); ::fftwf_execute_dft(p,in,out);
} }
inline static void fftw_destroy_plan(const FFTW_plan p) { inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p); ::fftwf_destroy_plan(p);
} }
}; };
#endif #endif
#ifndef FFTW_FORWARD
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#endif #endif
class FFT { class FFT {
private: private:
GridCartesian *vgrid;
GridCartesian *sgrid;
int Nd;
double flops; double flops;
double flops_call; double flops_call;
uint64_t usec; uint64_t usec;
Coordinate dimensions;
Coordinate processors;
Coordinate processor_coor;
public: public:
static const int forward=FFTW_FORWARD; static const int forward=FFTW_FORWARD;
@@ -237,25 +126,31 @@ public:
double MFlops(void) {return flops/usec;} double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;} double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) FFT ( GridCartesian * grid ) :
vgrid(grid),
Nd(grid->_ndimension),
dimensions(grid->_fdimensions),
processors(grid->_processors),
processor_coor(grid->_processor_coor)
{ {
flops=0; flops=0;
usec =0; usec =0;
Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors,*grid);
}; };
~FFT ( void) { ~FFT ( void) {
// delete sgrid; delete sgrid;
} }
template<class vobj> template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){ void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
// vgrid=result.Grid(); conformable(result.Grid(),vgrid);
// conformable(result.Grid(),vgrid); conformable(source.Grid(),vgrid);
// conformable(source.Grid(),vgrid); Lattice<vobj> tmp(vgrid);
const int Ndim = source.Grid()->Nd(); tmp = source;
Lattice<vobj> tmp = source; for(int d=0;d<Nd;d++){
for(int d=0;d<Ndim;d++){
if( mask[d] ) { if( mask[d] ) {
FFT_dim(result,tmp,d,sign); FFT_dim(result,tmp,d,sign);
tmp=result; tmp=result;
@@ -265,70 +160,59 @@ public:
template<class vobj> template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){ void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
const int Ndim = source.Grid()->Nd(); Coordinate mask(Nd,1);
Coordinate mask(Ndim,1);
FFT_dim_mask(result,source,mask,sign); FFT_dim_mask(result,source,mask,sign);
} }
template<class vobj> template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){ void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
const int Ndim = source.Grid()->Nd(); #ifndef HAVE_FFTW
GridBase *grid = source.Grid(); assert(0);
conformable(result.Grid(),source.Grid()); #else
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
int L = grid->_ldimensions[dim]; int L = vgrid->_ldimensions[dim];
int G = grid->_fdimensions[dim]; int G = vgrid->_fdimensions[dim];
Coordinate layout(Ndim,1); Coordinate layout(Nd,1);
Coordinate pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
// Construct pencils // Construct pencils
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
//std::cout << "CPU view" << std::endl; Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar); int Ncomp = sizeof(sobj)/sizeof(scalar);
int64_t Nlow = 1; int Nlow = 1;
int64_t Nhigh = 1;
for(int d=0;d<dim;d++){ for(int d=0;d<dim;d++){
Nlow*=grid->_ldimensions[d]; Nlow*=vgrid->_ldimensions[d];
} }
for(int d=dim+1;d<Ndim;d++){
Nhigh*=grid->_ldimensions[d];
}
int64_t Nperp=Nlow*Nhigh;
deviceVector<scalar> pgbuf; // Layout is [perp][component][dim]
pgbuf.resize(Nperp*Ncomp*G);
scalar *pgbuf_v = &pgbuf[0];
int rank = 1; /* 1d transforms */ int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */ int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp * Nperp; int howmany = Ncomp;
int odist,idist,istride,ostride; int odist,idist,istride,ostride;
idist = odist = G; /* Distance between consecutive FT's */ idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = 1; /* Distance between two elements in the same FT */ istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n; int *inembed = n, *onembed = n;
scalar div; scalar div;
if ( sign == backward ) div = 1.0/G; if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0; else if ( sign == forward ) div = 1.0;
else GRID_ASSERT(0); else assert(0);
double t_pencil=0;
double t_fft =0;
double t_total =-usecond();
// std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
/*
*
*/
FFTW_plan p; FFTW_plan p;
{ {
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -342,154 +226,68 @@ public:
} }
// Barrel shift and collect global pencil // Barrel shift and collect global pencil
// std::cout << GridLogPerformance<<"Making pencil" << std::endl; Coordinate lcoor(Nd), gcoor(Nd);
Coordinate lcoor(Ndim), gcoor(Ndim);
double t_copy=0;
double t_shift=0;
t_pencil = -usecond();
result = source; result = source;
int pc = grid->_processor_coor[dim]; int pc = processor_coor[dim];
const Coordinate ldims = grid->_ldimensions;
const Coordinate rdims = grid->_rdimensions;
const Coordinate sdims = grid->_simd_layout;
Coordinate processors = grid->_processors;
Coordinate pgdims(Ndim);
pgdims[0] = G;
for(int d=0, dd=1;d<Ndim;d++){
if ( d!=dim ) pgdims[dd++] = ldims[d];
}
int64_t pgvol=1;
for(int d=0;d<Ndim;d++) pgvol*=pgdims[d];
const int Nsimd = vobj::Nsimd();
for(int p=0;p<processors[dim];p++) { for(int p=0;p<processors[dim];p++) {
t_copy-=usecond();
autoView(r_v,result,AcceleratorRead);
accelerator_for(idx, grid->oSites(), vobj::Nsimd(), {
#ifdef GRID_SIMT
{ {
int lane=acceleratorSIMTlane(Nsimd); // buffer lane autoView(r_v,result,CpuRead);
#else autoView(p_v,pgbuf,CpuWrite);
for(int lane=0;lane<Nsimd;lane++) { thread_for(idx, sgrid->lSites(),{
#endif Coordinate cbuf(Nd);
Coordinate icoor; sobj s;
Coordinate ocoor; sgrid->LocalIndexToLocalCoor(idx,cbuf);
Coordinate pgcoor; peekLocalSite(s,r_v,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
Lexicographic::CoorFromIndex(icoor,lane,sdims); pokeLocalSite(s,p_v,cbuf);
Lexicographic::CoorFromIndex(ocoor,idx,rdims);
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + ((pc+p)%processors[dim])*L;
for(int d=0,dd=1;d<Ndim;d++){
if ( d!=dim ) {
pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
dd++;
}
}
// Map coordinates in lattice layout to FFTW index
int64_t pgidx;
Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
vector_type *from = (vector_type *)&r_v[idx];
scalar_type stmp;
for(int w=0;w<Ncomp;w++){
int64_t pg_idx = pgidx + w*pgvol;
stmp = getlane(from[w], lane);
pgbuf_v[pg_idx] = stmp;
}
#ifdef GRID_SIMT
}
#else
}
#endif
}); });
}
t_copy+=usecond();
if (p != processors[dim] - 1) { if (p != processors[dim] - 1) {
Lattice<vobj> temp(grid); result = Cshift(result,dim,L);
t_shift-=usecond();
temp = Cshift(result,dim,L); result = temp;
t_shift+=usecond();
} }
} }
t_pencil += usecond();
FFTW_scalar *in = (FFTW_scalar *)pgbuf_v; // Loop over orthog coords
FFTW_scalar *out= (FFTW_scalar *)pgbuf_v; int NN=pencil_g.lSites();
t_fft = -usecond(); GridStopWatch timer;
FFTW<scalar>::fftw_execute_dft(p,in,out,sign); timer.Start();
t_fft += usecond(); thread_for( idx,NN,{
Coordinate cbuf(Nd);
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
});
timer.Stop();
// performance counting // performance counting
flops_call = 5.0*howmany*G*log2(G); double add,mul,fma;
usec = t_fft; FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops= flops_call; flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
result = Zero(); // writing out result
double t_insert = -usecond();
{ {
autoView(r_v,result,AcceleratorWrite); autoView(pgbuf_v,pgbuf,CpuRead);
accelerator_for(idx,grid->oSites(),Nsimd,{ autoView(result_v,result,CpuWrite);
#ifdef GRID_SIMT thread_for(idx,sgrid->lSites(),{
{ Coordinate clbuf(Nd), cgbuf(Nd);
int lane=acceleratorSIMTlane(Nsimd); // buffer lane sobj s;
#else sgrid->LocalIndexToLocalCoor(idx,clbuf);
for(int lane=0;lane<Nsimd;lane++) { cgbuf = clbuf;
#endif cgbuf[dim] = clbuf[dim]+L*pc;
Coordinate icoor(Ndim); peekLocalSite(s,pgbuf_v,cgbuf);
Coordinate ocoor(Ndim); pokeLocalSite(s,result_v,clbuf);
Coordinate pgcoor(Ndim);
Lexicographic::CoorFromIndex(icoor,lane,sdims);
Lexicographic::CoorFromIndex(ocoor,idx,rdims);
pgcoor[0] = ocoor[dim] + icoor[dim]*rdims[dim] + pc*L;
for(int d=0,dd=1;d<Ndim;d++){
if ( d!=dim ) {
pgcoor[dd] = ocoor[d] + icoor[d]*rdims[d];
dd++;
}
}
// Map coordinates in lattice layout to FFTW index
int64_t pgidx;
Lexicographic::IndexFromCoor(pgcoor,pgidx,pgdims);
vector_type *to = (vector_type *)&r_v[idx];
scalar_type stmp;
for(int w=0;w<Ncomp;w++){
int64_t pg_idx = pgidx + w*pgvol;
stmp = pgbuf_v[pg_idx];
putlane(to[w], stmp, lane);
}
#ifdef GRID_SIMT
}
#else
}
#endif
}); });
} }
result = result*div; result = result*div;
t_insert +=usecond();
// destroying plan // destroying plan
FFTW<scalar>::fftw_destroy_plan(p); FFTW<scalar>::fftw_destroy_plan(p);
#endif
t_total +=usecond();
std::cout <<GridLogPerformance<< " FFT took "<<t_total/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " FFT pencil "<<t_pencil/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " of which copy "<<t_copy/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " of which shift"<<t_shift/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " FFT kernels "<<t_fft/1.0e6 <<" s" << std::endl;
std::cout <<GridLogPerformance<< " FFT insert "<<t_insert/1.0e6 <<" s" << std::endl;
} }
}; };

View File

@@ -64,7 +64,7 @@ public:
// //
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm // I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases // while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an GRID_ASSERT trap in the non-herm. This isn't right; there must be a better C++ way to // with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes // do it, but I fear it required multiple inheritance and mixed in abstract base classes
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
@@ -103,38 +103,6 @@ public:
_Mat.MdagM(in,out); _Mat.MdagM(in,out);
} }
}; };
////////////////////////////////////////////////////////////////////
// Linear operator wrapper whose Hermitian operation is M Mdag.
// Op / AdjOp and the coarsening hooks forward directly to the
// wrapped matrix; HermOp applies _Mat.MMdag.
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class MMdagLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat; // wrapped matrix, held by reference (not owned)
public:
  MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};

  // Support for coarsening to a multigrid
  void OpDiag (const Field &in, Field &out) { _Mat.Mdiag(in,out); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { _Mat.Mdir(in,out,dir,disp); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ _Mat.MdirAll(in,out); };

  // Plain and adjoint application of the wrapped matrix
  void Op     (const Field &in, Field &out){ _Mat.M(in,out); }
  void AdjOp  (const Field &in, Field &out){ _Mat.Mdag(in,out); }

  // out = M Mdag in
  void HermOp(const Field &in, Field &out){
    _Mat.MMdag(in,out);
  }

  // out = M Mdag in ; n1 = Re <in|out> ; n2 = |out|^2
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    _Mat.MMdag(in,out);
    const ComplexD in_dot_out = innerProduct(in,out);
    n1 = real(in_dot_out);
    n2 = norm2(out);
  }
};
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother // Construct herm op and shift it for mgrid smoother
@@ -148,22 +116,22 @@ public:
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out); _Mat.Mdiag(in,out);
GRID_ASSERT(0); assert(0);
} }
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp); _Mat.Mdir(in,out,dir,disp);
GRID_ASSERT(0); assert(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){ void OpDirAll (const Field &in, std::vector<Field> &out){
GRID_ASSERT(0); assert(0);
}; };
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
_Mat.M(in,out); _Mat.M(in,out);
GRID_ASSERT(0); assert(0);
} }
void AdjOp (const Field &in, Field &out){ void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
GRID_ASSERT(0); assert(0);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out); HermOp(in,out);
@@ -177,44 +145,6 @@ public:
} }
}; };
////////////////////////////////////////////////////////////////////
// Create a shifted HermOp
////////////////////////////////////////////////////////////////////
// Wraps an existing LinearOperatorBase and adds a real multiple of the input
// to its HermOp:  out = HermOp(in) + shift * in.
// Op and AdjOp both route to the shifted HermOp; the coarsening hooks trap.
template<class Field>
class ShiftedHermOpLinearOperator : public LinearOperatorBase<Field> {
  LinearOperatorBase<Field> &_Mat; // operator being shifted, held by reference (not owned)
  RealD _shift;                    // coefficient multiplying the input field
public:
  ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};

  // Support for coarsening to a multigrid: none of these are implemented here
  void OpDiag (const Field &in, Field &out) { GRID_ASSERT(0); }
  void OpDir  (const Field &in, Field &out,int dir,int disp) { GRID_ASSERT(0); }
  void OpDirAll (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); };

  // The shifted operator itself
  void HermOp(const Field &in, Field &out){
    _Mat.HermOp(in,out);
    out = out + _shift*in;
  }

  // Both the operator and its adjoint are the shifted HermOp
  void Op     (const Field &in, Field &out){ HermOp(in,out); }
  void AdjOp  (const Field &in, Field &out){ HermOp(in,out); }

  // out = shifted HermOp(in) ; n1 = Re <in|out> ; n2 = |out|^2
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
    HermOp(in,out);
    const ComplexD in_dot_out = innerProduct(in,out);
    n1 = real(in_dot_out);
    n2 = norm2(out);
  }
};
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Wrap an already herm matrix // Wrap an already herm matrix
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
@@ -271,42 +201,10 @@ public:
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
GRID_ASSERT(0); assert(0);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
GRID_ASSERT(0); assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
GRID_ASSERT(0);
}
void HermOp(const Field &in, Field &out){
GRID_ASSERT(0);
} }
}; };
@@ -345,13 +243,13 @@ class SchurOperatorBase : public LinearOperatorBase<Field> {
} }
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
GRID_ASSERT(0); // must coarsen the unpreconditioned system assert(0); // must coarsen the unpreconditioned system
} }
void OpDir (const Field &in, Field &out,int dir,int disp) { void OpDir (const Field &in, Field &out,int dir,int disp) {
GRID_ASSERT(0); assert(0);
} }
void OpDirAll (const Field &in, std::vector<Field> &out){ void OpDirAll (const Field &in, std::vector<Field> &out){
GRID_ASSERT(0); assert(0);
}; };
}; };
template<class Matrix,class Field> template<class Matrix,class Field>
@@ -447,10 +345,10 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
MpcDag(tmp,out); MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) { virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
GRID_ASSERT(0); assert(0);
} }
virtual void HermOp(const Field& in, Field& out) { virtual void HermOp(const Field& in, Field& out) {
GRID_ASSERT(0); assert(0);
} }
void Op(const Field& in, Field& out) { void Op(const Field& in, Field& out) {
Mpc(in, out); Mpc(in, out);
@@ -460,13 +358,13 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
} }
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag(const Field& in, Field& out) { void OpDiag(const Field& in, Field& out) {
GRID_ASSERT(0); // must coarsen the unpreconditioned system assert(0); // must coarsen the unpreconditioned system
} }
void OpDir(const Field& in, Field& out, int dir, int disp) { void OpDir(const Field& in, Field& out, int dir, int disp) {
GRID_ASSERT(0); assert(0);
} }
void OpDirAll(const Field& in, std::vector<Field>& out){ void OpDirAll(const Field& in, std::vector<Field>& out){
GRID_ASSERT(0); assert(0);
}; };
}; };
@@ -580,7 +478,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
public: public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{ {
GRID_ASSERT( _Mat.isTrivialEE() ); assert( _Mat.isTrivialEE() );
mass = _Mat.Mass(); mass = _Mat.Mass();
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
@@ -611,7 +509,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
Mpc(in,out); Mpc(in,out);
} }
virtual void MpcDagMpc(const Field &in, Field &out) { virtual void MpcDagMpc(const Field &in, Field &out) {
GRID_ASSERT(0);// Never need with staggered assert(0);// Never need with staggered
} }
}; };
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>; template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
@@ -623,7 +521,7 @@ template<class Field> class OperatorFunction {
public: public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0; virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) { virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
GRID_ASSERT(in.size()==out.size()); assert(in.size()==out.size());
for(int k=0;k<in.size();k++){ for(int k=0;k<in.size();k++){
(*this)(Linop,in[k],out[k]); (*this)(Linop,in[k],out[k]);
} }
@@ -637,7 +535,7 @@ public:
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out) virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{ {
GRID_ASSERT(in.size() == out.size()); assert(in.size() == out.size());
for (unsigned int i = 0; i < in.size(); ++i) for (unsigned int i = 0; i < in.size(); ++i)
{ {

View File

@@ -45,11 +45,6 @@ public:
M(in,tmp); M(in,tmp);
Mdag(tmp,out); Mdag(tmp,out);
} }
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0; virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;

View File

@@ -59,7 +59,7 @@ public:
RealD diff = hi-lo; RealD diff = hi-lo;
RealD delta = diff*1.0e-9; RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) { for (RealD x=lo; x<hi; x+=delta) {
delta*=1.02; delta*=1.1;
RealD f = approx(x); RealD f = approx(x);
out<< x<<" "<<f<<std::endl; out<< x<<" "<<f<<std::endl;
} }
@@ -90,8 +90,9 @@ public:
order=_order; order=_order;
if(order < 2) exit(-1); if(order < 2) exit(-1);
Coeffs.resize(order,0.0); Coeffs.resize(order);
Coeffs[order-1] = 1.0; Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
}; };
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's. // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
@@ -131,26 +132,6 @@ public:
Coeffs[j] = s * 2.0/order; Coeffs[j] = s * 2.0/order;
} }
}; };
// Fit Chebyshev coefficients to an arbitrary callable on [_lo,_hi].
//   _lo,_hi : interval that is mapped onto [-1,1]
//   _order  : number of coefficients; the process exits with -1 if below 2
//   func    : callable evaluated as func(x) with x a RealD in the interval
// For every coefficient j the callable is sampled at the `order` points
// x_k = 0.5*( cos(pi (k+0.5)/order) * (hi-lo) + (hi+lo) ).
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
  lo    = _lo;
  hi    = _hi;
  order = _order;

  if(order < 2) exit(-1);

  Coeffs.resize(order);

  for(int j=0;j<order;j++){
    RealD acc=0;
    for(int k=0;k<order;k++){
      // sample point on [-1,1], then mapped to [lo,hi]
      RealD node  =std::cos(M_PI*(k+0.5)/order);
      RealD xk    =0.5*(node*(hi-lo)+(hi+lo));
      RealD fk    =func(xk);
      acc += fk*std::cos( j*M_PI*(k+0.5)/order );
    }
    Coeffs[j] = acc * 2.0/order;
  }
};
void JacksonSmooth(void){ void JacksonSmooth(void){
@@ -269,9 +250,7 @@ public:
RealD xscale = 2.0/(hi-lo); RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo); RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y); Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in); axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1 // sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1; // out = ()*T0 + Coeffs[1]*T1;

View File

@@ -40,7 +40,7 @@ public:
RealD norm; RealD norm;
RealD lo,hi; RealD lo,hi;
MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), tolerances(n), lo(_lo), hi(_hi) {;}; MultiShiftFunction(int n,RealD _lo,RealD _hi): poles(n), residues(n), lo(_lo), hi(_hi) {;};
RealD approx(RealD x); RealD approx(RealD x);
void csv(std::ostream &out); void csv(std::ostream &out);
void gnuplot(std::ostream &out); void gnuplot(std::ostream &out);

View File

@@ -121,7 +121,7 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
// Reallocate arrays, since degree has changed // Reallocate arrays, since degree has changed
if (num_degree != n || den_degree != d) allocate(num_degree,den_degree); if (num_degree != n || den_degree != d) allocate(num_degree,den_degree);
GRID_ASSERT(a_len<=SUM_MAX); assert(a_len<=SUM_MAX);
step = new bigfloat[num_degree+den_degree+2]; step = new bigfloat[num_degree+den_degree+2];
@@ -151,9 +151,9 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
equations(); equations();
if (delta < tolerance) { if (delta < tolerance) {
std::cout<<"Delta too small, try increasing precision\n"; std::cout<<"Delta too small, try increasing precision\n";
GRID_ASSERT(0); assert(0);
}; };
GRID_ASSERT( delta>= tolerance); assert( delta>= tolerance);
search(step); search(step);
} }

View File

@@ -134,7 +134,7 @@ class AlgRemez
virtual ~AlgRemez(); virtual ~AlgRemez();
int getDegree(void){ int getDegree(void){
GRID_ASSERT(n==d); assert(n==d);
return n; return n;
} }
// Reset the bounds of the approximation // Reset the bounds of the approximation

View File

@@ -28,11 +28,11 @@ void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyTy
pow_n = num_degree; pow_n = num_degree;
pow_d = den_degree; pow_d = den_degree;
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) GRID_ASSERT(0); if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) GRID_ASSERT(0); if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) GRID_ASSERT(0); if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) GRID_ASSERT(0); if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0);
num_type = num_type_in; num_type = num_type_in;
den_type = den_type_in; den_type = den_type_in;
@@ -112,9 +112,9 @@ double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degre
equations(); equations();
if (delta < tolerance) { if (delta < tolerance) {
std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n"; std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
GRID_ASSERT(0); assert(0);
}; };
GRID_ASSERT( delta>= tolerance ); assert( delta>= tolerance );
search(); search();
} }
@@ -278,7 +278,7 @@ void AlgRemezGeneral::equations(){
if(num_pows[j] != -1){ *aa++ = z; t++; } if(num_pows[j] != -1){ *aa++ = z; t++; }
z *= x; z *= x;
} }
GRID_ASSERT(t == n+1); assert(t == n+1);
z = (bigfloat)1l; z = (bigfloat)1l;
t = 0; t = 0;
@@ -286,7 +286,7 @@ void AlgRemezGeneral::equations(){
if(den_pows[j] != -1){ *aa++ = -y * z; t++; } if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
z *= x; z *= x;
} }
GRID_ASSERT(t == d); assert(t == d);
B[i] = y * z; // Right hand side vector B[i] = y * z; // Right hand side vector
} }

View File

@@ -106,7 +106,7 @@ class AlgRemezGeneral{
bigfloat (*f)(bigfloat x, void *data), void *data); bigfloat (*f)(bigfloat x, void *data), void *data);
inline int getDegree(void) const{ inline int getDegree(void) const{
GRID_ASSERT(n==d); assert(n==d);
return n; return n;
} }
// Reset the bounds of the approximation // Reset the bounds of the approximation

View File

@@ -74,7 +74,7 @@ bigfloat epsilonMobius(bigfloat x, void* data){
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out, void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in, const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound){ const RealD lambda_bound){
GRID_ASSERT(omega_in.size() == Ls_in); assert(omega_in.size() == Ls_in);
omega_out.resize(Ls_out); omega_out.resize(Ls_out);
//Use the Remez algorithm to generate the appropriate rational polynomial //Use the Remez algorithm to generate the appropriate rational polynomial

File diff suppressed because it is too large Load Diff

View File

@@ -1,300 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MomentumProject.h
Copyright (C) 2025
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
MultiMomProject
Import vectors -> nxyz x (ncomponent x nt)
Import complex phases -> nmom x nxy
apply = via (possibly batched) GEMM
*/
template<class Field, class ComplexField>
class MomentumProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
GridBase *grid;
uint64_t nmom;
uint64_t nxyz;
uint64_t nt;
uint64_t nbtw;
uint64_t words;
deviceVector<scalar> BLAS_V; //
deviceVector<scalar> BLAS_M; //
deviceVector<scalar> BLAS_P; //
MomentumProject(){};
~MomentumProject(){ Deallocate(); };
void Deallocate(void)
{
grid=nullptr;
nmom=0;
nxyz=0;
nt=0;
nbtw=0;
words=0;
BLAS_V.resize(0);
BLAS_M.resize(0);
BLAS_P.resize(0);
}
void Allocate(int _nmom,GridBase *_grid)
{
grid=_grid;
Coordinate ldims = grid->LocalDimensions();
nmom=_nmom;
nt = ldims[grid->Nd()-1];
nxyz = grid->lSites()/nt;
words = sizeof(scalar_object)/sizeof(scalar);
nbtw = nt * words;
BLAS_V.resize (nxyz * nt * words );
BLAS_M.resize (nmom * nxyz );
BLAS_P.resize (nmom * nt * words );
}
void ImportMomenta(const std::vector <ComplexField> &momenta)
{
GRID_ASSERT(momenta.size()==nmom);
// might as well just make the momenta here
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_M.size();
GRID_ASSERT(momenta.size()==nmom)
GRID_ASSERT(momenta[0].Grid()==grid);
GRID_ASSERT(sz = nxyz * nmom);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims = grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords = words; // local variable for copy in to GPU
int64_t Nxyz = nxyz;
auto blasData_p = &BLAS_M[0];
for(int m=0;m<momenta.size();m++){
autoView( Data , momenta[m], AcceleratorRead);
auto Data_p = &Data[0];
accelerator_for(xyz,nxyz,1,{
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Lexicographic::CoorFromIndex(lcoor,xyz,ldims);
Coordinate icoor(nd);
Coordinate ocoor(nd);
for (int d = 0; d < nd; d++) {
icoor[d] = lcoor[d]/rdimensions[d];
ocoor[d] = lcoor[d]%rdimensions[d];
}
int64_t osite;
int64_t isite;
Lexicographic::IndexFromCoor(ocoor,osite,rdimensions);
Lexicographic::IndexFromCoor(icoor,isite,simd);
// BLAS_M[nmom][slice_vol]
// Fortran Column major BLAS layout is M_xyz,mom
scalar data = extractLane(isite,Data[osite]);
uint64_t idx = xyz+m*Nxyz;
blasData_p[idx] = data;
});
}
}
void ImportVector(Field &vec)
{
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_V.size();
GRID_ASSERT(sz = nxyz * words * nt);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims= grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords= words; // local variable for copy in to GPU
auto blasData_p = &BLAS_V[0];
autoView( Data , vec, AcceleratorRead);
auto Data_p = &Data[0];
int64_t nwords = words;// for capture
int64_t Nt = nt;// for capture
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Coordinate icoor(nd);
Coordinate ocoor(nd);
Lexicographic::CoorFromIndex(icoor,lane,simd);
Lexicographic::CoorFromIndex(ocoor,sf,rdimensions);
int64_t l_xyz = 0;
for (int d = 0; d < nd; d++) {
lcoor[d] = rdimensions[d]*icoor[d] + ocoor[d];
}
uint64_t l_t = lcoor[nd-1];
Coordinate xyz_coor = lcoor;
xyz_coor[nd-1] =0;
Lexicographic::IndexFromCoor(xyz_coor,l_xyz,ldims);
scalar_object data = extractLane(lane,Data[sf]);
scalar *data_words = (scalar *) &data;
for(int w = 0 ; w < nwords; w++) {
// BLAS_V[slice_vol][nt][words]
// Fortran Column major BLAS layout is V_(t,w)_xyz
uint64_t idx = w+l_t*nwords + l_xyz * nwords * Nt;
blasData_p[idx] = data_words[w];
}
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
void ExportMomentumProjection(std::vector<typename Field::scalar_object> &projection)
{
projection.resize(nmom*nt);
acceleratorCopyFromDevice(&BLAS_P[0],(scalar *)&projection[0],BLAS_P.size()*sizeof(scalar));
// Could decide on a layout late?
}
// Row major layout "C" order:
// BLAS_V[slice_vol][nt][words]
// BLAS_M[nmom][slice_vol]
// BLAS_P[nmom][nt][words]
//
// Fortran Column major BLAS layout is V_(w,t)_xyz
// Fortran Column major BLAS layout is M_xyz,mom
// Fortran Column major BLAS layout is P_(w,t),mom
//
// Projected
//
// P = (V * M)_(w,t),mom
//
// Project `data` onto the stored momentum phases with one GEMM and reduce the
// result across MPI ranks.
//
// data            : field to project; staged into BLAS_V by ImportVector.
// projected_gdata : resized to gt*nmom, where gt is the global extent of the
//                   last dimension. Entry [t_global + gt*m] holds momentum m on
//                   global slice t_global. Slices not owned by this rank are
//                   zero before the global sum fills them in.
//
// Wall-clock time for the four phases is reported on GridLogPerformance.
void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata)
{
  double t_import=0;
  double t_export=0;
  double t_gemm =0;
  double t_allreduce=0;

  /////////////////////////////////////////
  // Stage the field and build the one-entry pointer tables for batched GEMM
  /////////////////////////////////////////
  t_import-=usecond();
  this->ImportVector(data);
  std::vector< typename Field::scalar_object > local_planes;
  deviceVector<scalar *> Vd(1);
  deviceVector<scalar *> Md(1);
  deviceVector<scalar *> Pd(1);
  scalar * v_base = & BLAS_V[0];
  scalar * m_base = & BLAS_M[0];
  scalar * p_base = & BLAS_P[0];
  acceleratorPut(Vd[0],v_base);
  acceleratorPut(Md[0],m_base);
  acceleratorPut(Pd[0],p_base);
  t_import+=usecond();

  /////////////////////////////////////////
  // P = (V * M)_(w,t),mom
  /////////////////////////////////////////
  GridBLAS BLAS;
  t_gemm-=usecond();
  BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
                   words*nt,nmom,nxyz,
                   scalar(1.0),
                   Vd,
                   Md,
                   scalar(0.0), // wipe out result
                   Pd);
  BLAS.synchronise();
  t_gemm+=usecond();

  t_export-=usecond();
  ExportMomentumProjection(local_planes); // resizes
  t_export+=usecond();

  /////////////////////////////////
  // Reduce across MPI ranks
  /////////////////////////////////
  const int nd = grid->Nd();
  const int gt = grid->GlobalDimensions()[nd-1];
  const int lt = grid->LocalDimensions()[nd-1];
  const int st = grid->LocalStarts()[nd-1];   // first global slice held by this rank

  // Global array; everything not on this node stays zero
  projected_gdata.resize(gt*nmom);
  for(auto &plane : projected_gdata){
    plane=Zero();
  }
  // Drop the local slices into their global positions
  for(int m=0;m<nmom;m++){
    for(int t=0;t<lt;t++){
      projected_gdata[(t+st) + gt*m] = local_planes[t+lt*m];
    }
  }

  t_allreduce-=usecond();
  grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words);
  t_allreduce+=usecond();

  std::cout << GridLogPerformance<<" MomentumProject t_import "<<t_import<<"us"<<std::endl;
  std::cout << GridLogPerformance<<" MomentumProject t_export "<<t_export<<"us"<<std::endl;
  std::cout << GridLogPerformance<<" MomentumProject t_gemm "<<t_gemm<<"us"<<std::endl;
  std::cout << GridLogPerformance<<" MomentumProject t_reduce "<<t_allreduce<<"us"<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@@ -1,376 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
// Construction needs no setup: every buffer starts empty and is sized on
// first use by MulMatrix / InnerProductMatrix.
MultiRHSBlockCGLinalg() {}
// Destruction shrinks the buffers through Deallocate().
~MultiRHSBlockCGLinalg() { Deallocate(); }
// Shrink every device-side buffer owned by this helper to zero length:
// the three data buffers, the reduction buffer and the three pointer tables.
void Deallocate(void)
{
  // data buffers
  BLAS_X.resize(0);
  BLAS_Y.resize(0);
  BLAS_C.resize(0);
  BLAS_Cred.resize(0);
  // batched-GEMM pointer tables
  Xdip.resize(0);
  Ydip.resize(0);
  Cdip.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
// m(r,s) = sum over all sites of conj(X[r]) . Y[s], summed across MPI ranks.
//
// m : output, overwritten with an nrhs x nrhs matrix.
// X : left fields (conjugated); its size defines nrhs.
// Y : right fields; must have the same size as X and be conformable with it.
//
// Strategy: the local volume is cut into `vol` chunks, each covering
// rd0 = _rdimensions[0]*_rdimensions[1] outer sites. X and Y are repacked as
// [chunk][rhs][words] and one batched GEMM produces an nrhs x nrhs partial
// matrix per chunk. The partials are summed on the host and then globally.
//
// Changes relative to the previous revision:
//  * the single-GEMM variant that was compiled out under `#if 0` is removed
//    (dead code; it remains available in version control);
//  * the "cp" bandwidth figure now counts all nrhs right-hand sides, matching
//    the interval it is divided by;
//  * empty input returns a 0 x 0 matrix instead of indexing X[0];
//  * the host-side reinterpretation of `scalar` as std::complex<double> is
//    asserted rather than assumed.
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
  int nrhs;
  GridBase *grid;
  uint64_t vol;
  uint64_t words;

  nrhs = X.size();
  GRID_ASSERT(X.size()==Y.size());
  if ( nrhs == 0 ) {
    m = Eigen::MatrixXcd::Zero(0,0);
    return;
  }
  // HOST_C and m are both accessed as std::complex<double> below
  GRID_ASSERT(sizeof(scalar)==sizeof(std::complex<double>));
  conformable(X[0],Y[0]);
  grid = X[0].Grid();

  // One batch entry per group of rd0 outer sites
  int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
  vol   = grid->oSites()/rd0;
  words = rd0*sizeof(vector_object)/sizeof(scalar);
  int64_t vw = vol * words;
  GRID_ASSERT(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));

  RealD t0 = usecond();
  BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
  BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
  BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
  RealD t1 = usecond();

  /////////////////////////////////////////////
  // Copy in the multi-rhs sources -- layout batched BLAS ready
  /////////////////////////////////////////////
  for(int r=0;r<nrhs;r++){
    autoView(x_v,X[r],AcceleratorRead);
    autoView(y_v,Y[r],AcceleratorRead);
    scalar *from_x=(scalar *)&x_v[0];
    scalar *from_y=(scalar *)&y_v[0];
    scalar *BX = &BLAS_X[0];
    scalar *BY = &BLAS_Y[0];
    accelerator_for(ssw,vw,1,{
      uint64_t ss=ssw/words;
      uint64_t w=ssw%words;
      uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
      BX[offset] = from_x[ssw];
      BY[offset] = from_y[ssw];
    });
  }
  RealD t2 = usecond();

  /*
   * in Fortran column major notation (cuBlas order)
   *
   * Xxr = [X1(x)][..][Xn(x)]
   *
   * Yxr = [Y1(x)][..][Ym(x)]
   *
   * C_rs = X^dag Y
   */
  // Pointer tables: built on the host, uploaded in one copy each
  Xdip.resize(vol);
  Ydip.resize(vol);
  Cdip.resize(vol);
  std::vector<scalar *> Xh(vol);
  std::vector<scalar *> Yh(vol);
  std::vector<scalar *> Ch(vol);
  for(uint64_t ss=0;ss<vol;ss++){
    Xh[ss] = & BLAS_X[ss*nrhs*words];
    Yh[ss] = & BLAS_Y[ss*nrhs*words];
    Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
  }
  acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
  acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
  acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));

  GridBLAS BLAS;
  RealD t3 = usecond();
  /////////////////////////////////////////
  // C_rs = X^dag Y
  /////////////////////////////////////////
  BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
                   nrhs,nrhs,words,
                   ComplexD(1.0),
                   Xdip,
                   Ydip,
                   ComplexD(0.0), // wipe out C
                   Cdip);
  BLAS.synchronise();
  RealD t4 = usecond();

  // vol partial matrices of nrhs x nrhs coefficients each
  std::vector<scalar> HOST_C(BLAS_Cred.size());
  acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
  RealD t5 = usecond();

  // Local sum over the batch
  m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
  for(uint64_t ss=0;ss<vol;ss++){
    Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
    m += eC;
  }
  RealD t6l = usecond();

  // Sum over ranks
  grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
  RealD t6 = usecond();

  uint64_t M=nrhs;
  uint64_t N=nrhs;
  uint64_t K=vw;
  // Repacking reads X and Y and writes BLAS_X and BLAS_Y for every right-hand
  // side inside [t1,t2]: 4 * nrhs * (bytes per field).
  RealD xybytes = 1.0*nrhs*grid->lSites()*sizeof(scalar_object);
  RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
  RealD flops = 8.0*M*N*K;
  flops = flops/(t4-t3)/1.e3;
  bytes = bytes/(t4-t3)/1.e3;
  xybytes = 4*xybytes/(t2-t1)/1.e3;

  std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
  std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@@ -1,513 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockProject.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
MultiRHS block projection
Import basis -> nblock x nbasis x (block x internal)
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
=> coarse_(nrhs x nbasis )^block = via batched GEMM
//template<class vobj,class CComplex,int nbasis,class VLattice>
//inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
// const VLattice &fineData,
// const VLattice &Basis)
*/
template<class Field>
class MultiRHSBlockProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef Field Fermion;
int nbasis;
GridBase *coarse_grid;
GridBase *fine_grid;
uint64_t block_vol;
uint64_t fine_vol;
uint64_t coarse_vol;
uint64_t words;
// Row major layout "C" order:
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
/*
* in Fortran column major notation (cuBlas order)
*
* Vxb = [v1(x)][..][vn(x)] ... x coarse vol
*
* Fxr = [r1(x)][..][rm(x)] ... x coarse vol
*
* Block project:
* C_br = V^dag F x coarse vol
*
* Block promote:
* F_xr = Vxb Cbr x coarse_vol
*/
deviceVector<scalar> BLAS_V; // words * block_vol * nbasis x coarse_vol
deviceVector<scalar> BLAS_F; // nrhs x fine_vol * words -- the sources
deviceVector<scalar> BLAS_C; // nrhs x coarse_vol * nbasis -- the coarse coeffs
// Debug helper: squared 2-norm of a device buffer, summed over the ranks of
// coarse_grid. The whole buffer is copied to the host and accumulated there.
// Returns the real part of sum_s blas[s] * adj(blas[s]).
RealD blasNorm2(deviceVector<scalar> &blas)
{
  const auto nwords = blas.size();
  std::vector<scalar> host(nwords);
  acceleratorCopyFromDevice(&blas[0],&host[0],nwords*sizeof(scalar));

  scalar accum(0.0);
  for(auto &z : host){
    accum=accum+z*adj(z);
  }
  coarse_grid->GlobalSum(accum);
  return real(accum);
}
MultiRHSBlockProject(){};
~MultiRHSBlockProject(){ Deallocate(); };
void Deallocate(void)
{
nbasis=0;
coarse_grid=nullptr;
fine_grid=nullptr;
fine_vol=0;
block_vol=0;
coarse_vol=0;
words=0;
BLAS_V.resize(0);
BLAS_F.resize(0);
BLAS_C.resize(0);
}
// Record the projector geometry and size the basis buffer.
//
// _nbasis : number of basis vectors per block.
// _fgrid  : fine grid.
// _cgrid  : coarse grid.
//
// block_vol is the number of local fine sites per local coarse site (integer
// division). BLAS_V is sized for nbasis fine-grid vectors; BLAS_F and BLAS_C
// are sized later, once the number of right-hand sides is known.
void Allocate(int _nbasis,GridBase *_fgrid,GridBase *_cgrid)
{
  nbasis      = _nbasis;
  fine_grid   = _fgrid;
  coarse_grid = _cgrid;

  words       = sizeof(scalar_object)/sizeof(scalar);
  coarse_vol  = coarse_grid->lSites();
  fine_vol    = fine_grid->lSites();
  block_vol   = fine_vol/coarse_vol;

  BLAS_V.resize (fine_vol * words * nbasis );
}
// Pack a set of fine-grid fields into the flat device buffer `blas`, in the
// blocked layout used by the batched GEMMs of this class.
//
// vecs : fields to import; vecs[0] must live on fine_grid.
// blas : destination. It is zeroed first and must hold exactly
//        Nsimd * coarse oSites * block_vol * nvec * words scalars (asserted).
//
// For vector v, fine outer site sf and SIMD lane `lane`, the scalar site object
// is written at word offset site*words with
//   site = (lane*osites + sc*bv)*nvec + v*bv + sb
// where sc is the coarse outer site containing sf, sb the position of sf
// inside its block, bv = block_vol and osites the fine oSites count.
// When osites == coarse oSites * bv this is the ordering [lane][sc][v][sb].
void ImportFineGridVectors(std::vector <Field > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename Field::vector_object vobj;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
// Block extent per dimension, in reduced (outer-site) coordinates
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
}
uint64_t sz = blas.size();
// Clear the whole destination before any vector is written
acceleratorMemSet(&blas[0],0,blas.size()*sizeof(scalar));
// Local copies of the reduced dimensions for use inside the kernel
Coordinate fine_rdimensions = fine_grid->_rdimensions;
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
int64_t bv= block_vol;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector importing vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
autoView( fineData , vecs[v], AcceleratorRead);
auto blasData_p = &blas[0];
// NOTE(review): fineData_p is not referenced below; the kernel indexes fineData directly
auto fineData_p = &fineData[0];
int64_t osites = fine_grid->oSites();
// loop over fine sites
const int Nsimd = vobj::Nsimd();
// std::cout << "sz "<<sz<<std::endl;
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
// Size check; the condition does not depend on v
GRID_ASSERT(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
uint64_t lwords= words; // local variable for copy in to GPU
accelerator_for(sf,osites,Nsimd,{
// With GRID_SIMT the lane is obtained from acceleratorSIMTlane();
// otherwise every lane of the outer site is visited in a loop.
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
// One thread per fine site
Coordinate coor_f(_ndimension);
Coordinate coor_b(_ndimension);
Coordinate coor_c(_ndimension);
// Fine site to fine coor
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
// Split into position inside the block (coor_b) and block coordinate (coor_c)
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
int sc;// coarse site
int sb;// block site
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
scalar_object data = extractLane(lane,fineData[sf]);
// BLAS layout address calculation
// words * block_vol * nbasis x coarse_vol
// coarse oSite x block vol x lanes
int64_t site = (lane*osites + sc*bv)*nvec
+ v*bv
+ sb;
// GRID_ASSERT(site*lwords<sz);
// Store the whole site object at its slot (lwords scalars per site)
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
*ptr = data;
#ifdef GRID_SIMT
}
#else
}
#endif
});
// std::cout << " import fine Blas norm "<<blasNorm2(blas)<<std::endl;
// std::cout << " BlockProjector imported vector"<<v<<std::endl;
}
}
// Inverse of ImportFineGridVectors: unpack the blocked device buffer `blas`
// into a set of fine-grid fields.
//
// vecs : destination fields; vecs[0] must live on fine_grid. Every lane of
//        every outer site of every vector is overwritten.
// blas : source buffer. For vector v, fine outer site sf and SIMD lane `lane`
//        the site object is read from word offset site*words with
//          site = (lane*osites + sc*bv)*nvec + v*bv + sb
//        (sc = coarse outer site, sb = site inside the block, bv = block_vol).
//        No size check is made here, unlike the import routine.
void ExportFineGridVectors(std::vector <Field> &vecs, deviceVector<scalar> &blas)
{
typedef typename Field::vector_object vobj;
int nvec = vecs.size();
GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
// Block extent per dimension, in reduced (outer-site) coordinates
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine_grid->_rdimensions[d] / coarse_grid->_rdimensions[d];
}
// Local copies of the reduced dimensions for use inside the kernel
Coordinate fine_rdimensions = fine_grid->_rdimensions;
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
// std::cout << " export fine Blas norm "<<blasNorm2(blas)<<std::endl;
int64_t bv= block_vol;
for(int v=0;v<vecs.size();v++){
autoView( fineData , vecs[v], AcceleratorWrite);
auto blasData_p = &blas[0];
// NOTE(review): fineData_p is not referenced below; the kernel indexes fineData directly
auto fineData_p = &fineData[0];
int64_t osites = fine_grid->oSites();
uint64_t lwords = words;
// std::cout << " Nsimd is "<<vobj::Nsimd() << std::endl;
// std::cout << " lwords is "<<lwords << std::endl;
// std::cout << " sizeof(scalar_object) is "<<sizeof(scalar_object) << std::endl;
// loop over fine sites
accelerator_for(sf,osites,vobj::Nsimd(),{
// With GRID_SIMT the lane is obtained from acceleratorSIMTlane();
// otherwise every lane of the outer site is visited in a loop.
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(vobj::Nsimd()); // buffer lane
#else
for(int lane=0;lane<vobj::Nsimd();lane++) {
#endif
// One thread per fine site
Coordinate coor_f(_ndimension);
Coordinate coor_b(_ndimension);
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
// Split into position inside the block (coor_b) and block coordinate (coor_c)
for(int d=0;d<_ndimension;d++) coor_b[d] = coor_f[d]%block_r[d];
for(int d=0;d<_ndimension;d++) coor_c[d] = coor_f[d]/block_r[d];
int sc; // coarse site
int sb; // site within the block
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
Lexicographic::IndexFromCoor(coor_b,sb,block_r);
// BLAS layout address calculation
// words * block_vol * nbasis x coarse_vol
int64_t site = (lane*osites + sc*bv)*nvec
+ v*bv
+ sb;
// Load the whole site object and place it in this lane of the field
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
scalar_object data = *ptr;
insertLane(lane,fineData[sf],data);
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
}
// Pack a set of coarse-grid fields into the flat device buffer `blas`.
//
// vecs : coarse fields; vecs[0] must live on coarse_grid. The scalar site
//        object of vobj must consist of exactly nbasis scalars (asserted).
// blas : destination. For vector v, coarse outer site sc and SIMD lane `lane`
//        the site object is written starting at scalar index
//          (lane*osites + sc)*nvec*cwords + v*cwords
//        i.e. ordering [lane][sc][v][cwords]. The buffer is neither cleared nor
//        size-checked here.
template<class vobj>
void ImportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==coarse_grid);
// NOTE(review): _ndimension, sz and coarse_rdimensions are not referenced below
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector importing coarse vector"<<v<<" "<<norm2(vecs[v])<<std::endl;
autoView( coarseData , vecs[v], AcceleratorRead);
auto blasData_p = &blas[0];
// NOTE(review): coarseData_p is not referenced below; the kernel indexes coarseData directly
auto coarseData_p = &coarseData[0];
int64_t osites = coarse_grid->oSites();
// loop over coarse sites
const int Nsimd = vobj::Nsimd();
// Scalars per coarse site object; must match the basis size
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
// With GRID_SIMT the lane is obtained from acceleratorSIMTlane();
// otherwise every lane of the outer site is visited in a loop.
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
// C_br per site
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
coarse_scalar_object data = extractLane(lane,coarseData[sc]);
// Store all cwords coefficients of this site in one assignment
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
*ptr = data;
#ifdef GRID_SIMT
}
#else
}
#endif
});
// std::cout << " import coarse Blas norm "<<blasNorm2(blas)<<std::endl;
}
}
// Inverse of ImportCoarseGridVectors: unpack the flat device buffer `blas`
// into a set of coarse-grid fields.
//
// vecs : destination coarse fields; vecs[0] must live on coarse_grid. The
//        scalar site object of vobj must consist of exactly nbasis scalars
//        (asserted).
// blas : source. For vector v, coarse outer site sc and SIMD lane `lane` the
//        site object is read starting at scalar index
//          (lane*osites + sc)*nvec*cwords + v*cwords
//        i.e. ordering [lane][sc][v][cwords]. No size check is made here.
template<class vobj>
void ExportCoarseGridVectors(std::vector <Lattice<vobj> > &vecs, deviceVector<scalar> &blas)
{
int nvec = vecs.size();
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==coarse_grid);
// NOTE(review): _ndimension, sz and coarse_rdimensions are not referenced below
int _ndimension = coarse_grid->_ndimension;
uint64_t sz = blas.size();
Coordinate coarse_rdimensions = coarse_grid->_rdimensions;
// std::cout << " export coarse Blas norm "<<blasNorm2(blas)<<std::endl;
for(int v=0;v<vecs.size();v++){
// std::cout << " BlockProjector exporting coarse vector"<<v<<std::endl;
autoView( coarseData , vecs[v], AcceleratorWrite);
auto blasData_p = &blas[0];
// NOTE(review): coarseData_p is not referenced below; the kernel indexes coarseData directly
auto coarseData_p = &coarseData[0];
int64_t osites = coarse_grid->oSites();
// loop over coarse sites
const int Nsimd = vobj::Nsimd();
// Scalars per coarse site object; must match the basis size
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
// Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
// With GRID_SIMT the lane is obtained from acceleratorSIMTlane();
// otherwise every lane of the outer site is visited in a loop.
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
int64_t blas_site = (lane*osites + sc)*nvec*cwords + v*cwords;
// Load all cwords coefficients of this site and place them in this lane
coarse_scalar_object * ptr = (coarse_scalar_object *)&blasData_p[blas_site];
coarse_scalar_object data = *ptr;
insertLane(lane,coarseData[sc],data);
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
}
void ImportBasis(std::vector < Field > &vecs)
{
// std::cout << " BlockProjector Import basis size "<<vecs.size()<<std::endl;
ImportFineGridVectors(vecs,BLAS_V);
}
template<class cobj>
void blockProject(std::vector<Field> &fine,std::vector< Lattice<cobj> > & coarse)
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
GRID_ASSERT(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
/////////////////////////////////////////////
// Copy in the multi-rhs sources to same data layout
/////////////////////////////////////////////
// std::cout << "BlockProject import fine"<<std::endl;
ImportFineGridVectors(fine,BLAS_F);
deviceVector<scalar *> Vd(coarse_vol);
deviceVector<scalar *> Fd(coarse_vol);
deviceVector<scalar *> Cd(coarse_vol);
// std::cout << "BlockProject pointers"<<std::endl;
for(int c=0;c<coarse_vol;c++){
// BLAS_V[coarse_vol][nbasis][block_vol][words]
// BLAS_F[coarse_vol][nrhs][block_vol][words]
// BLAS_C[coarse_vol][nrhs][nbasis]
scalar * Vh = & BLAS_V[c*nbasis*block_vol*words];
scalar * Fh = & BLAS_F[c*nrhs*block_vol*words];
scalar * Ch = & BLAS_C[c*nrhs*nbasis];
acceleratorPut(Vd[c],Vh);
acceleratorPut(Fd[c],Fh);
acceleratorPut(Cd[c],Ch);
}
GridBLAS BLAS;
// std::cout << "BlockProject BLAS"<<std::endl;
int64_t vw = block_vol * words;
/////////////////////////////////////////
// C_br = V^dag R
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw,
scalar(1.0),
Vd,
Fd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl;
ExportCoarseGridVectors(coarse, BLAS_C);
// std::cout << "BlockProject done"<<std::endl;
}
// Block-promote a set of coarse fields back to the fine grid:
//   F_xr = V_xb C_br   for every coarse site, via one batched GEMM.
//
// fine   : output fine-grid fields, one per right-hand side; its size defines
//          nrhs.
// coarse : input coarse fields. The scalar site object of cobj must hold
//          exactly nbasis scalars.
//
// Requires Allocate() and ImportBasis() to have been called.
//
// Fixes relative to the previous revision:
//  * per-coarse-site buffer offsets are computed in 64 bits; the old
//    `c*nrhs*nbasis` was evaluated entirely in `int` and could overflow;
//  * `coarse` and `fine` must have the same length, because the import step
//    uses coarse.size() as a layout stride into a buffer sized from fine.size().
template<class cobj>
void blockPromote(std::vector<Field> &fine,std::vector<Lattice<cobj> > & coarse)
{
  int nrhs=fine.size();
  int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
  GRID_ASSERT(nbasis==_nbasis);
  GRID_ASSERT(nrhs>0);
  GRID_ASSERT(coarse.size()==fine.size());

  BLAS_F.resize (fine_vol * words * nrhs );
  BLAS_C.resize (coarse_vol * nbasis * nrhs );

  ImportCoarseGridVectors(coarse, BLAS_C);

  GridBLAS BLAS;
  deviceVector<scalar *> Vd(coarse_vol);
  deviceVector<scalar *> Fd(coarse_vol);
  deviceVector<scalar *> Cd(coarse_vol);

  // Scalars per coarse site in each buffer
  // BLAS_V[coarse_vol][nbasis][block_vol][words]
  // BLAS_F[coarse_vol][nrhs][block_vol][words]
  // BLAS_C[coarse_vol][nrhs][nbasis]
  const uint64_t v_stride = static_cast<uint64_t>(nbasis)*block_vol*words;
  const uint64_t f_stride = static_cast<uint64_t>(nrhs)  *block_vol*words;
  const uint64_t c_stride = static_cast<uint64_t>(nrhs)  *nbasis;
  for(uint64_t c=0;c<coarse_vol;c++){
    scalar * Vh = & BLAS_V[c*v_stride];
    scalar * Fh = & BLAS_F[c*f_stride];
    scalar * Ch = & BLAS_C[c*c_stride];
    acceleratorPut(Vd[c],Vh);
    acceleratorPut(Fd[c],Fh);
    acceleratorPut(Cd[c],Ch);
  }

  /////////////////////////////////////////
  // Block promote:
  // F_xr = Vxb Cbr (x coarse_vol)
  /////////////////////////////////////////
  int64_t vw = block_vol * words;
  BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
                   vw,nrhs,nbasis,
                   scalar(1.0),
                   Vd,
                   Cd,
                   scalar(0.0), // wipe out F
                   Fd);
  BLAS.synchronise();

  ExportFineGridVectors(fine, BLAS_F);
}
};
NAMESPACE_END(Grid);

View File

@@ -1,233 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSDeflation.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs projection
i) MultiRHS Deflation
Import Evecs -> nev x vol x internal
Import vector of Lattice objects -> nrhs x vol x internal
=> Cij (nrhs x Nev) via GEMM.
=> Guess (nrhs x vol x internal) = C x evecs (via GEMM)
Export
ii) MultiRHS block projection
Import basis -> nblock x nbasis x (block x internal)
Import vector of fine lattice objects -> nblock x nrhs x (block x internal)
=> coarse_(nrhs x nbasis )^block = via batched GEMM
iii) Alternate interface:
Import higher dim Lattice object-> vol x nrhs layout
*/
template<class Field>
class MultiRHSDeflation
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
int nev;
std::vector<RealD> eval;
GridBase *grid;
uint64_t vol;
uint64_t words;
deviceVector<scalar> BLAS_E; // nev x vol -- the eigenbasis (up to a 1/sqrt(lambda))
deviceVector<scalar> BLAS_R; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_G; // nrhs x vol -- the guess
deviceVector<scalar> BLAS_C; // nrhs x nev -- the coefficients
MultiRHSDeflation(){};
~MultiRHSDeflation(){ Deallocate(); };
void Deallocate(void)
{
nev=0;
grid=nullptr;
vol=0;
words=0;
BLAS_E.resize(0);
BLAS_R.resize(0);
BLAS_C.resize(0);
BLAS_G.resize(0);
}
// Record the grid and eigenvector count and size the eigenbasis storage.
//
// _nev  : number of eigenvectors that will be imported.
// _grid : grid the eigenvectors live on; its lSites() gives the local volume.
//
// Only `eval` and BLAS_E are sized here. The per-solve buffers are sized in
// DeflateSources once the number of right-hand sides is known.
void Allocate(int _nev,GridBase *_grid)
{
  grid  = _grid;
  nev   = _nev;
  words = sizeof(scalar_object)/sizeof(scalar);
  vol   = grid->lSites();

  eval.resize(nev);
  BLAS_E.resize (vol * words * nev );

  std::cout << GridLogMessage << " Allocate for "<<nev<<" eigenvectors and volume "<<vol<<std::endl;
}
// Store one eigenpair in slot `ev`.
//
// evec  : eigenvector field; copied device-to-device into BLAS_E at word
//         offset ev*vol*words.
// _eval : its eigenvalue, recorded in eval[ev].
// ev    : slot index; must be below eval.size(), i.e. Allocate() must have
//         been called with enough slots.
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{
  GRID_ASSERT(ev<eval.size());
  eval[ev] = _eval;

  autoView(evec_v,evec,AcceleratorRead);
  scalar *slot = &BLAS_E[ev*vol*words];
  acceleratorCopyDeviceToDevice(&evec_v[0],slot,sizeof(scalar_object)*vol);
}
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval)
{
ImportEigenBasis(evec,_eval,0,evec.size());
}
// Could use to import a batch of eigenvectors
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
{
GRID_ASSERT(_ev0+_nev<=evec.size());
Allocate(_nev,evec[0].Grid());
// Imports a sub-batch of eigenvectors, _ev0, ..., _ev0+_nev-1
for(int e=0;e<nev;e++){
std::cout << "Importing eigenvector "<<e<<" evalue "<<_eval[_ev0+e]<<std::endl;
ImportEigenVector(evec[_ev0+e],_eval[_ev0+e],e);
}
}
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
{
int nrhs = source.size();
GRID_ASSERT(source.size()==guess.size());
GRID_ASSERT(grid == guess[0].Grid());
conformable(guess[0],source[0]);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_R.resize(nrhs * vw); // cost free if size doesn't change
BLAS_G.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nev * nrhs);// cost free if size doesn't change
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
// for(int r=0;r<nrhs;r++){
// std::cout << " source["<<r<<"] = "<<norm2(source[r])<<std::endl;
// }
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(v,source[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&v[0],&BLAS_R[offset],sizeof(scalar_object)*vol);
}
/*
* in Fortran column major notation (cuBlas order)
*
* Exe = [e1(x)][..][en(x)]
*
* Rxr = [r1(x)][..][rm(x)]
*
* C_er = E^dag R
* C_er = C_er / lambda_e
* G_xr = Exe Cer
*/
deviceVector<scalar *> Ed(1);
deviceVector<scalar *> Rd(1);
deviceVector<scalar *> Cd(1);
deviceVector<scalar *> Gd(1);
scalar * Eh = & BLAS_E[0];
scalar * Rh = & BLAS_R[0];
scalar * Ch = & BLAS_C[0];
scalar * Gh = & BLAS_G[0];
acceleratorPut(Ed[0],Eh);
acceleratorPut(Rd[0],Rh);
acceleratorPut(Cd[0],Ch);
acceleratorPut(Gd[0],Gh);
GridBLAS BLAS;
/////////////////////////////////////////
// C_er = E^dag R
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw,
scalar(1.0),
Ed,
Rd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
GRID_ASSERT(BLAS_C.size()==nev*nrhs);
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nev*nrhs);
for(int e=0;e<nev;e++){
RealD lam(1.0/eval[e]);
for(int r=0;r<nrhs;r++){
int off = e+nev*r;
HOST_C[off]=HOST_C[off] * lam;
// std::cout << "C["<<e<<"]["<<r<<"] ="<<HOST_C[off]<< " eval[e] "<<eval[e] <<std::endl;
}
}
acceleratorCopyToDevice(&HOST_C[0],&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/////////////////////////////////////////
// Guess G_xr = Exe Cer
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev,
scalar(1.0),
Ed, // x . nev
Cd, // nev . nrhs
scalar(0.0),
Gd);
BLAS.synchronise();
///////////////////////////////////////
// Copy out the multirhs
///////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(v,guess[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_G[offset],&v[0],sizeof(scalar_object)*vol);
}
RealD t1 = usecond();
std::cout << GridLogMessage << "MultiRHSDeflation for "<<nrhs<<" sources with "<<nev<<" eigenvectors took " << (t1-t0)/1e3 <<" ms"<<std::endl;
}
};
NAMESPACE_END(Grid);

View File

@@ -33,111 +33,109 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* Script A = SolverMatrix * Script A = SolverMatrix
* Script P = Preconditioner * Script P = Preconditioner
* *
* Deflation methods considered
* -- Solve P A x = P b [ like Luscher ]
* DEF-1 M P A x = M P b [i.e. left precon]
* DEF-2 P^T M A x = P^T M b
* ADEF-1 Preconditioner = M P + Q [ Q + M + M A Q]
* ADEF-2 Preconditioner = P^T M + Q
* BNN Preconditioner = P^T M P + Q
* BNN2 Preconditioner = M P + P^TM +Q - M P A M
*
* Implement ADEF-2 * Implement ADEF-2
* *
* Vstart = P^Tx + Qb * Vstart = P^Tx + Qb
* M1 = P^TM + Q * M1 = P^TM + Q
* M2=M3=1 * M2=M3=1
* Vout = x
*/ */
NAMESPACE_BEGIN(Grid);
// abstract base
template<class Field> template<class Field, class CoarseField>
class TwoLevelCG : public LinearFunction<Field> class TwoLevelFlexiblePcg : public LinearFunction<Field>
{ {
public: public:
int verbose;
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
const int mmax = 5;
GridBase *grid; GridBase *grid;
GridBase *coarsegrid;
// Fine operator, Smoother, CoarseSolver LinearOperatorBase<Field> *_Linop
LinearOperatorBase<Field> &_FineLinop; OperatorFunction<Field> *_Smoother,
LinearFunction<Field> &_Smoother; LinearFunction<CoarseField> *_CoarseSolver;
// Need somthing that knows how to get from Coarse to fine and back again
// more most opertor functions // more most opertor functions
TwoLevelCG(RealD tol, TwoLevelFlexiblePcg(RealD tol,
Integer maxit, Integer maxit,
LinearOperatorBase<Field> &FineLinop, LinearOperatorBase<Field> *Linop,
LinearFunction<Field> &Smoother, LinearOperatorBase<Field> *SmootherLinop,
GridBase *fine) : OperatorFunction<Field> *Smoother,
OperatorFunction<CoarseField> CoarseLinop
) :
Tolerance(tol), Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
_FineLinop(FineLinop), _Linop(Linop),
_Smoother(Smoother) _PreconditionerLinop(PrecLinop),
_Preconditioner(Preconditioner)
{ {
grid = fine; verbose=0;
}; };
virtual void operator() (const Field &src, Field &x) // The Pcg routine is common to all, but the various matrices differ from derived
{ // implementation to derived implmentation
std::cout << GridLogMessage<<"HDCG: fPcg starting single RHS"<<std::endl; void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f; RealD f;
RealD rtzp,rtz,a,d,b; RealD rtzp,rtz,a,d,b;
RealD rptzp; RealD rptzp;
RealD tn;
RealD guess = norm2(psi);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
///////////////////////////// /////////////////////////////
// Set up history vectors // Set up history vectors
///////////////////////////// /////////////////////////////
int mmax = 5;
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
std::vector<Field> p (mmax,grid); std::vector<Field> p (mmax,grid);
std::vector<Field> mmp(mmax,grid); std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax); std::vector<RealD> pAp(mmax);
Field x (grid); x = psi;
Field z (grid); Field z (grid);
Field tmp(grid); Field tmp(grid);
Field mp (grid);
Field r (grid); Field r (grid);
Field mu (grid); Field mu (grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated"<<std::endl;
//Initial residual computation & set up
RealD guess = norm2(x);
std::cout << GridLogMessage<<"HDCG: fPcg guess nrm "<<guess<<std::endl;
RealD src_nrm = norm2(src);
std::cout << GridLogMessage<<"HDCG: fPcg src nrm "<<src_nrm<<std::endl;
if ( src_nrm == 0.0 ) {
std::cout << GridLogMessage<<"HDCG: fPcg given trivial source norm "<<src_nrm<<std::endl;
x=Zero();
}
RealD tn;
GridStopWatch HDCGTimer;
HDCGTimer.Start();
////////////////////////// //////////////////////////
// x0 = Vstart -- possibly modify guess // x0 = Vstart -- possibly modify guess
////////////////////////// //////////////////////////
x=src;
Vstart(x,src); Vstart(x,src);
// r0 = b -A x0 // r0 = b -A x0
_FineLinop.HermOp(x,mmp[0]); HermOp(x,mmp); // Shouldn't this be something else?
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0 axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
{
double n1 = norm2(x);
double n2 = norm2(mmp[0]);
double n3 = norm2(r);
std::cout<<GridLogMessage<<"x,vstart,r = "<<n1<<" "<<n2<<" "<<n3<<std::endl;
}
////////////////////////////////// //////////////////////////////////
// Compute z = M1 x // Compute z = M1 x
////////////////////////////////// //////////////////////////////////
PcgM1(r,z); M1(r,z,tmp,mp,SmootherMirs);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
/////////////////////////////////////// ///////////////////////////////////////
// Solve for Mss mu = P A z and set p = z-mu // Solve for Mss mu = P A z and set p = z-mu
// Def2 p = 1 - Q Az = Pright z // Def2: p = 1 - Q Az = Pright z
// Other algos M2 is trivial // Other algos M2 is trivial
/////////////////////////////////////// ///////////////////////////////////////
PcgM2(z,p[0]); M2(z,p[0]);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
std::cout << GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" rsq "<<rsq<<"\n";
Field pp(grid);
for (int k=0;k<=MaxIterations;k++){ for (int k=0;k<=MaxIterations;k++){
@@ -145,7 +143,7 @@ class TwoLevelCG : public LinearFunction<Field>
int peri_kp = (k+1) % mmax; int peri_kp = (k+1) % mmax;
rtz=rtzp; rtz=rtzp;
d= PcgM3(p[peri_k],mmp[peri_k]); d= M3(p[peri_k],mp,mmp[peri_k],tmp);
a = rtz/d; a = rtz/d;
// Memorise this // Memorise this
@@ -155,36 +153,21 @@ class TwoLevelCG : public LinearFunction<Field>
RealD rn = axpy_norm(r,-a,mmp[peri_k],r); RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
// Compute z = M x // Compute z = M x
PcgM1(r,z); M1(r,z,tmp,mp);
{
RealD n1,n2;
n1=norm2(r);
n2=norm2(z);
std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : vector r,z "<<n1<<" "<<n2<<"\n";
}
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
std::cout << GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : inner rtzp "<<rtzp<<"\n";
// PcgM2(z,p[0]); M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
p[peri_kp]=mu; p[peri_kp]=p[peri_k];
// Standard search direction p -> z + b p // Standard search direction p -> z + b p ; b =
b = (rtzp)/rtz; b = (rtzp)/rtz;
int northog; int northog;
// k=zero <=> peri_kp=1; northog = 1
// k=1 <=> peri_kp=2; northog = 2
// ... ... ...
// k=mmax-2<=> peri_kp=mmax-1; northog = mmax-1
// k=mmax-1<=> peri_kp=0; northog = 1
// northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm // northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
for(int back=0; back < northog; back++){ for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax; int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp])); RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
@@ -193,324 +176,75 @@ class TwoLevelCG : public LinearFunction<Field>
} }
RealD rrn=sqrt(rn/ssq); RealD rrn=sqrt(rn/ssq);
RealD rtn=sqrt(rtz/ssq); std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
RealD rtnp=sqrt(rtzp/ssq);
std::cout<<GridLogMessage<<"HDCG: fPcg k= "<<k<<" residual = "<<rrn<<"\n";
// Stopping condition // Stopping condition
if ( rn <= rsq ) { if ( rn <= rsq ) {
HDCGTimer.Stop(); HermOp(x,mmp); // Shouldn't this be something else?
std::cout<<GridLogMessage<<"HDCG: fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
_FineLinop.HermOp(x,mmp[0]);
axpy(tmp,-1.0,src,mmp[0]); axpy(tmp,-1.0,src,mmp[0]);
RealD mmpnorm = sqrt(norm2(mmp[0])); RealD psinorm = sqrt(norm2(x));
RealD xnorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src)); RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp)); RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm; RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
<<"HDCG: true residual is "<<true_residual std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
<<" solution "<<xnorm return k;
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
RealD xnorm = sqrt(norm2(x));
RealD srcnorm = sqrt(norm2(src));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::cout << GridLogMessage<<"HDCG: fPcg allocating"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated p"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated mmp"<<std::endl;
src[0].Grid()->Barrier();
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::cout << GridLogMessage<<"HDCG: fPcg allocated pAp"<<std::endl;
src[0].Grid()->Barrier();
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
std::cout << GridLogMessage<<"HDCG: fPcg allocated z,mp,r,mu"<<std::endl;
src[0].Grid()->Barrier();
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
GRID_ASSERT(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
HDCGTimer.Start();
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
a[rhs] = rtz[rhs]/d[rhs];
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
}
// Compute z = M x (for *all* RHS)
PcgM1(r,z);
std::cout << GridLogMessage<<"HDCG::fPcg M1 complete"<<std::endl;
grid->Barrier();
RealD max_rn=0.0;
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
std::cout<<GridLogMessage<<"HDCG::fPcg iteration "<<k<<" : orthogonalising to last "<<northog<<" vectors\n";
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG: rhs "<<rhs<<"fPcg k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
} }
} }
// Non-convergence
assert(0);
}
public: public:
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out) virtual void M(Field & in,Field & out,Field & tmp) {
{
std::cout << "PcgM1 default (cheat) mrhs version"<<std::endl;
for(int rhs=0;rhs<in.size();rhs++){
this->PcgM1(in[rhs],out[rhs]);
}
}
virtual void PcgM1(Field & in, Field & out) =0;
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src)
{
std::cout << "Vstart default (cheat) mrhs version"<<std::endl;
for(int rhs=0;rhs<x.size();rhs++){
this->Vstart(x[rhs],src[rhs]);
}
}
virtual void Vstart(Field & x,const Field & src)=0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
} }
virtual RealD PcgM3(const Field & p, Field & mmp){ virtual void M1(Field & in, Field & out) {// the smoother
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public TwoLevelCG<Field>
{
public:
///////////////////////////////////////////////////////////////////////////////////
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
///////////////////////////////////////////////////////////////////////////////////
GridBase *coarsegrid;
Aggregation &_Aggregates;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
///////////////////////////////////////////////////////////////////////////////////
// more most opertor functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
{
coarsegrid = Aggregates.CoarseGrid;
};
virtual void PcgM1(Field & in, Field & out)
{
GRID_TRACE("MultiGridPreconditioner ");
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min] // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
Field tmp(this->grid); PcgM(in,Min); // Smoother call
Field Min(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
GridStopWatch SmootherTimer; HermOp(Min,out);
GridStopWatch MatrixTimer;
SmootherTimer.Start();
this->_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
this->_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min axpy(tmp,-1.0,out,in); // tmp = in - A Min
GridStopWatch ProjTimer; ProjectToSubspace(tmp,PleftProj);
GridStopWatch CoarseTimer; ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
GridStopWatch PromTimer; PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
ProjTimer.Start();
this->_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp axpy(out,1.0,Min,tmp); // Min+tmp
} }
virtual void Vstart(Field & x,const Field & src) virtual void M2(const Field & in, Field & out) {
{ out=in;
std::cout << GridLogMessage<<"HDCG: fPcg Vstart "<<std::endl; // Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
}
virtual RealD M3(const Field & p, Field & mmp){
double d,dd;
HermOpAndNorm(p,mmp,d,dd);
return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
}
virtual void VstartDef2(Field & xconst Field & src){
//case PcgDef2:
//case PcgAdef2:
//case PcgAdef2f:
//case PcgV11f:
/////////////////////////////////// ///////////////////////////////////
// Choose x_0 such that // Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess] // x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@@ -522,78 +256,142 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s // = src_s - (A guess)_s - src_s + (A guess)_s
// = 0 // = 0
/////////////////////////////////// ///////////////////////////////////
Field r(this->grid); Field r(grid);
Field mmp(this->grid); Field mmp(grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
std::cout << GridLogMessage<<"HDCG: fPcg Vstart projecting "<<std::endl; HermOp(x,mmp);
this->_Aggregates.ProjectToSubspace(PleftProj,src); axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
std::cout << GridLogMessage<<"HDCG: fPcg Vstart coarse solve "<<std::endl; ProjectToSubspace(r,PleftProj);
this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
std::cout << GridLogMessage<<"HDCG: fPcg Vstart promote "<<std::endl; PromoteFromSubspace(PleftMss_proj,mmp);
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x); x=x+mmp;
} }
}; virtual void Vstart(Field & x,const Field & src){
return;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out){
// P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ]
Field in_sbar(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
}
virtual void Pleft (Field & in,Field & out){
// P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ]
Field in_sbar(grid);
Field tmp2(grid);
Field Mtmp(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out);
HermOp(out,Mtmp);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
}
}
template<class Field> template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field> class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
{
public: public:
const std::vector<Field> &evec; virtual void M(Field & in,Field & out,Field & tmp){
const std::vector<RealD> &eval;
TwoLevelADEF1defl(RealD tol, }
Integer maxit, virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
std::vector<Field> &_evec,
std::vector<RealD> &_eval) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
evec(_evec),
eval(_eval)
{};
// Can just inherit existing M2 }
// Can just inherit existing M3 virtual void M2(Field & in, Field & out){
// Simple vstart - do nothing }
virtual void Vstart(Field & x,const Field & src){ virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
x=src; // Could apply Q
};
// Override PcgM1 }
virtual void PcgM1(Field & in, Field & out) virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
{
GRID_TRACE("EvecPreconditioner ");
int N=evec.size();
Field Pin(this->grid);
Field Qin(this->grid);
//MP + Q = M(1-AQ) + Q = M }
// // If we are eigenvector deflating in coarse space }
// // Q = Sum_i |phi_i> 1/lambda_i <phi_i| /*
// // A Q = Sum_i |phi_i> <phi_i| template<class Field>
// // M(1-AQ) = M(1-proj) + Q class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
Qin.Checkerboard()=in.Checkerboard(); public:
Qin = Zero(); virtual void M(Field & in,Field & out,Field & tmp);
Pin = in; virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
for (int i=0;i<N;i++) { virtual void M2(Field & in, Field & out);
const Field& tmp = evec[i]; virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
auto ip = TensorRemove(innerProduct(tmp,in)); virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
axpy(Qin, ip / eval[i],tmp,Qin);
axpy(Pin, -ip ,tmp,Pin);
} }
this->_Smoother(Pin,out); template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
out = out + Qin; public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
} }
};
NAMESPACE_END(Grid); template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif #endif

View File

@@ -1,734 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/AdefGeneric.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
/*
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
* Script A = SolverMatrix
* Script P = Preconditioner
*
* Implement ADEF-2
*
* Vstart = P^Tx + Qb
* M1 = P^TM + Q
* M2=M3=1
*/
NAMESPACE_BEGIN(Grid);
template<class Field>
class TwoLevelCGmrhs
{
public:
RealD Tolerance;
Integer MaxIterations;
GridBase *grid;
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer;
GridStopWatch DeflateTimer;
GridStopWatch CoarseTimer;
GridStopWatch FineTimer;
GridStopWatch SmoothTimer;
GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most operator functions
// Construct the multi-RHS two-level solver.
//   tol       : copied into Tolerance
//   maxit     : copied into MaxIterations
//   FineLinop : fine operator, bound to the _FineLinop reference member
//   Smoother  : smoother, bound to the _Smoother reference member
//   fine      : grid pointer, copied into grid
TwoLevelCGmrhs(RealD tol,
               Integer maxit,
               LinearOperatorBase<Field> &FineLinop,
               LinearFunction<Field> &Smoother,
               GridBase *fine)
  : Tolerance(tol),
    MaxIterations(maxit),
    grid(fine),
    _FineLinop(FineLinop),
    _Smoother(Smoother)
{
  // The rrr/sss/qqq/zzz scratch fields are commented out in the class body,
  // so there is nothing further to construct from `fine` here.
}
// Vector case
// Multi-RHS entry point: forwards the whole batch of sources and solutions
// to SolvePrecBlockCG. The SolveSingleSystem path is left disabled.
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
  this->SolvePrecBlockCG(src, x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR step built from a Cholesky (LLT) factorisation of a Gram matrix.
//   m_zz  (out) filled by _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z), then
//               replaced by its Hermitian average 0.5*(m_zz + m_zz^dag)
//   C     (out) L^dag, where L is the lower Cholesky factor of m_zz
//   Cinv  (out) inverse of C
//   Q     (out) result of _BlockCGLinalg.MulMatrix(Q ,Cinv,Z )
//   MQ    (out) result of _BlockCGLinalg.MulMatrix(MQ,Cinv,MZ)
//   Z, MZ (in)  input blocks, not modified here
// Prints a timing breakdown (microseconds) of the three phases to std::cout.
void ThinQRfact (Eigen::MatrixXcd &m_zz,
                 Eigen::MatrixXcd &C,
                 Eigen::MatrixXcd &Cinv,
                 std::vector<Field> & Q,
                 std::vector<Field> & MQ,
                 const std::vector<Field> & Z,
                 const std::vector<Field> & MZ)
{
  const RealD t_start = usecond();
  _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
  const RealD t_ip = usecond();

  // Hermitian average. m_zz appears adjointed on the right-hand side, and Eigen
  // evaluates the expression coefficient by coefficient straight into the
  // destination, so without a temporary some entries would be computed from
  // already-overwritten values (Eigen aliasing). .eval() forces the temporary.
  m_zz = (0.5*(m_zz+m_zz.adjoint())).eval();

  Eigen::MatrixXcd L = m_zz.llt().matrixL(); // m_zz = L L^dag
  C    = L.adjoint();
  Cinv = C.inverse();
  const RealD t_eigen = usecond();

  _BlockCGLinalg.MulMatrix( Q,Cinv,Z);
  _BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
  const RealD t_mul = usecond();

  std::cout << " ThinQRfact IP :"<< t_ip-t_start<<" us"<<std::endl;
  std::cout << " ThinQRfact Eigen :"<< t_eigen-t_ip<<" us"<<std::endl;
  std::cout << " ThinQRfact MulMat:"<< t_mul-t_eigen<<" us"<<std::endl;
}
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); GRID_ASSERT(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
GRID_ASSERT(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
std::vector<RealD> f(nrhs);
std::vector<RealD> rtzp(nrhs);
std::vector<RealD> rtz(nrhs);
std::vector<RealD> a(nrhs);
std::vector<RealD> d(nrhs);
std::vector<RealD> b(nrhs);
std::vector<RealD> rptzp(nrhs);
/////////////////////////////
// Set up history vectors
/////////////////////////////
int mmax = 3;
std::vector<std::vector<Field> > p(nrhs); for(int r=0;r<nrhs;r++) p[r].resize(mmax,grid);
std::vector<std::vector<Field> > mmp(nrhs); for(int r=0;r<nrhs;r++) mmp[r].resize(mmax,grid);
std::vector<std::vector<RealD> > pAp(nrhs); for(int r=0;r<nrhs;r++) pAp[r].resize(mmax);
std::vector<Field> z(nrhs,grid);
std::vector<Field> mp (nrhs,grid);
std::vector<Field> r (nrhs,grid);
std::vector<Field> mu (nrhs,grid);
//Initial residual computation & set up
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
GRID_ASSERT(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(x,src);
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
axpy (r[rhs], -1.0,mmp[rhs][0], src[rhs]); // Recomputes r=src-Ax0
}
//////////////////////////////////
// Compute z = M1 x
//////////////////////////////////
// This needs a multiRHS version for acceleration
PcgM1(r,z);
std::vector<RealD> ssq(nrhs);
std::vector<RealD> rsq(nrhs);
std::vector<Field> pp(nrhs,grid);
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
p[rhs][0]=z[rhs];
ssq[rhs]=norm2(src[rhs]);
rsq[rhs]= ssq[rhs]*Tolerance*Tolerance;
// std::cout << GridLogMessage<<"mrhs HDCG: "<<rhs<<" k=0 residual "<<rtzp[rhs]<<" rsq "<<rsq[rhs]<<"\n";
}
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
for(int rhs=0;rhs<nrhs;rhs++){
rtz[rhs]=rtzp[rhs];
M3Timer.Start();
d[rhs]= PcgM3(p[rhs][peri_k],mmp[rhs][peri_k]);
M3Timer.Stop();
a[rhs] = rtz[rhs]/d[rhs];
LinalgTimer.Start();
// Memorise this
pAp[rhs][peri_k] = d[rhs];
axpy(x[rhs],a[rhs],p[rhs][peri_k],x[rhs]);
rn[rhs] = axpy_norm(r[rhs],-a[rhs],mmp[rhs][peri_k],r[rhs]);
LinalgTimer.Stop();
}
// Compute z = M x (for *all* RHS)
M1Timer.Start();
PcgM1(r,z);
M1Timer.Stop();
RealD max_rn=0.0;
LinalgTimer.Start();
for(int rhs=0;rhs<nrhs;rhs++){
rtzp[rhs] =real(innerProduct(r[rhs],z[rhs]));
// std::cout << GridLogMessage<<"HDCG::fPcg rhs"<<rhs<<" iteration "<<k<<" : inner rtzp "<<rtzp[rhs]<<"\n";
mu[rhs]=z[rhs];
p[rhs][peri_kp]=mu[rhs];
// Standard search direction p == z + b p
b[rhs] = (rtzp[rhs])/rtz[rhs];
int northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[rhs][peri_back],p[rhs][peri_kp]));
RealD beta = -pbApk/pAp[rhs][peri_back];
axpy(p[rhs][peri_kp],beta,p[rhs][peri_back],p[rhs][peri_kp]);
}
RealD rrn=sqrt(rn[rhs]/ssq[rhs]);
RealD rtn=sqrt(rtz[rhs]/ssq[rhs]);
RealD rtnp=sqrt(rtzp[rhs]/ssq[rhs]);
std::cout<<GridLogMessage<<"HDCG:fPcg rhs "<<rhs<<" k= "<<k<<" residual = "<<rrn<<"\n";
if ( rrn > max_rn ) max_rn = rrn;
}
LinalgTimer.Stop();
// Stopping condition based on worst case
if ( max_rn <= Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : fine M3 "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs fPcg : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(x[rhs],mmp[rhs][0]);
Field tmp(grid);
axpy(tmp,-1.0,src[rhs],mmp[rhs][0]);
RealD mmpnorm = sqrt(norm2(mmp[rhs][0]));
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(tmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: not converged "<<HDCGTimer.Elapsed()<<std::endl;
for(int rhs=0;rhs<nrhs;rhs++){
RealD xnorm = sqrt(norm2(x[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
}
}
public:
// Preconditioner hook (pure virtual). Called by the solvers in this class with
// a batch of residuals in `in`, one Field per right-hand side; the
// preconditioned vectors are expected back in `out`.
virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out) = 0;
// Starting-guess hook (pure virtual). Called once before the first residual is
// computed; the implementation may modify the guesses in `x` given the
// sources in `src`.
virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src) = 0;
// Default M2 step: the identity, i.e. `out` becomes a copy of `in`.
// Derived classes may override it.
virtual void PcgM2(const Field & in, Field & out)
{
  out = in;
}
// Default M3 step: applies the fine operator, mmp = HermOp(p), and returns the
// real part of <p,mmp>.
virtual RealD PcgM3(const Field & p, Field & mmp)
{
  _FineLinop.HermOp(p,mmp);
  const ComplexD pAp = innerProduct(p,mmp);
  return real(pAp);
}
};
/////////////////////////////////////////////////////////////////////////////
// Two-level ADEF2 preconditioner for the multi-RHS solvers of TwoLevelCGmrhs.
//
// Supplies the two hooks the base class leaves pure virtual:
//   Vstart : improves the starting guess with a coarse-space solve
//   PcgM1  : smoother followed by a coarse-space correction of what is left
//
// The coarse solvers act on a single CoarseField living on `coarsegridmrhs`
// into which the per-rhs coarse vectors are inserted as slices, so all
// right-hand sides are solved on the coarse level in one call.
/////////////////////////////////////////////////////////////////////////////
template<class Field, class CoarseField>
class TwoLevelADEF2mrhs : public TwoLevelCGmrhs<Field>
{
public:
  GridBase *coarsegrid;     // coarse grid for a single rhs, taken from the projector
  GridBase *coarsegridmrhs; // grid of the packed multi-rhs coarse field, supplied by the caller
  LinearFunction<CoarseField> &_CoarseSolverMrhs;        // coarse solve used inside PcgM1
  LinearFunction<CoarseField> &_CoarseSolverPreciseMrhs; // coarse solve used by Vstart
  MultiRHSBlockProject<Field> &_Projector;               // fine <-> coarse transfer
  MultiRHSDeflation<CoarseField> &_Deflator;             // produces the guess for the coarse solves

  // tol, maxit, FineLinop and Smoother are forwarded to the base class together
  // with the projector's fine grid. Everything passed by reference is stored
  // by reference and must outlive this object.
  TwoLevelADEF2mrhs(RealD tol,
                    Integer maxit,
                    LinearOperatorBase<Field> &FineLinop,
                    LinearFunction<Field> &Smoother,
                    LinearFunction<CoarseField> &CoarseSolverMrhs,
                    LinearFunction<CoarseField> &CoarseSolverPreciseMrhs,
                    MultiRHSBlockProject<Field> &Projector,
                    MultiRHSDeflation<CoarseField> &Deflator,
                    GridBase *_coarsemrhsgrid) :
    TwoLevelCGmrhs<Field>(tol, maxit,FineLinop,Smoother,Projector.fine_grid),
    _CoarseSolverMrhs(CoarseSolverMrhs),
    _CoarseSolverPreciseMrhs(CoarseSolverPreciseMrhs),
    _Projector(Projector),
    _Deflator(Deflator)
  {
    coarsegrid     = Projector.coarse_grid;
    coarsegridmrhs = _coarsemrhsgrid; // This could be in projector
  }

  // Override Vstart
  //
  // Overwrites x with the promoted result of the precise coarse solve applied
  // to the projected sources. The incoming content of x is not read here.
  virtual void Vstart(std::vector<Field> & x,std::vector<Field> & src) override
  {
    int nrhs=x.size();
    ///////////////////////////////////
    // Choose x_0 such that
    // x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
    //     = [1 - Ass_inv A] Guess + Assinv src
    //     = P^T guess + Assinv src
    //     = Vstart [Tang notation]
    // This gives:
    // W^T (src - A x_0) = src_s - A guess_s - r_s
    //                   = src_s - (A guess)_s - src_s + (A guess)_s
    //                   = 0
    ///////////////////////////////////
    std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
    std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
    CoarseField PleftProjMrhs(this->coarsegridmrhs);
    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);

    // Restrict the sources and get a deflated guess for the coarse solve
    this->_Projector.blockProject(src,PleftProj);
    this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);

    // Pack the per-rhs coarse vectors into the multi-rhs coarse fields
    for(int rhs=0;rhs<nrhs;rhs++) {
      InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
      InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
    }

    this->_CoarseSolverPreciseMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} r_s

    // Unpack the coarse solutions and prolong them to the fine grid
    for(int rhs=0;rhs<nrhs;rhs++) {
      ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
    }
    this->_Projector.blockPromote(x,PleftMss_proj);
  }

  // Override PcgM1
  //
  // out = Min + Q [ in - A Min ], where Min is the smoother applied to `in`
  // and Q is project -> coarse solve -> promote. The base-class timers are
  // updated around each stage.
  virtual void PcgM1(std::vector<Field> & in,std::vector<Field> & out) override
  {
    int nrhs=in.size();

    // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
    std::vector<Field> tmp(nrhs,this->grid);
    std::vector<Field> Min(nrhs,this->grid);
    std::vector<CoarseField> PleftProj(nrhs,this->coarsegrid);
    std::vector<CoarseField> PleftMss_proj(nrhs,this->coarsegrid);
    CoarseField PleftProjMrhs(this->coarsegridmrhs);
    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);

    // this->rrr=in[0];

    // The block-smoother path is compiled out: the macro is undefined right
    // before it is tested, so the per-rhs loop below is always the one used.
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
    this->SmoothTimer.Start();
    this->_Smoother(in,Min);
    this->SmoothTimer.Stop();
#else
    for(int rhs=0;rhs<nrhs;rhs++) {
      this->SmoothTimer.Start();
      this->_Smoother(in[rhs],Min[rhs]);
      this->SmoothTimer.Stop();
    }
#endif
    // this->sss=Min[0];

    // Residual left by the smoother; `out` is used as scratch for A Min
    for(int rhs=0;rhs<nrhs;rhs++) {
      this->FineTimer.Start();
      this->_FineLinop.HermOp(Min[rhs],out[rhs]);
      axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
      this->FineTimer.Stop();
    }

    this->ProjectTimer.Start();
    this->_Projector.blockProject(tmp,PleftProj);
    this->ProjectTimer.Stop();

    this->DeflateTimer.Start();
    this->_Deflator.DeflateSources(PleftProj,PleftMss_proj);
    this->DeflateTimer.Stop();

    // Pack the per-rhs coarse vectors into the multi-rhs coarse fields
    this->InsertTimer.Start();
    for(int rhs=0;rhs<nrhs;rhs++) {
      InsertSliceFast(PleftProj[rhs],PleftProjMrhs,rhs,0);
      InsertSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0); // the guess
    }
    this->InsertTimer.Stop();

    this->CoarseTimer.Start();
    this->_CoarseSolverMrhs(PleftProjMrhs,PleftMss_projMrhs); // Ass^{-1} [in - A Min]_s
    this->CoarseTimer.Stop();

    // Unpack the coarse solutions again (accounted to the same timer as the packing)
    this->InsertTimer.Start();
    for(int rhs=0;rhs<nrhs;rhs++) {
      ExtractSliceFast(PleftMss_proj[rhs],PleftMss_projMrhs,rhs,0);
    }
    this->InsertTimer.Stop();

    this->PromoteTimer.Start();
    this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
    this->PromoteTimer.Stop();

    this->FineTimer.Start();
    // this->qqq=tmp[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
      axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
    }
    // this->zzz=out[0];
    this->FineTimer.Stop();
  }
};
NAMESPACE_END(Grid);

View File

@@ -47,7 +47,7 @@ class BiCGSTAB : public OperatorFunction<Field>
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -77,7 +77,7 @@ class BiCGSTAB : public OperatorFunction<Field>
// Initial residual computation & set up // Initial residual computation & set up
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
Linop.Op(psi, v); Linop.Op(psi, v);
b = norm2(v); b = norm2(v);
@@ -214,7 +214,7 @@ class BiCGSTAB : public OperatorFunction<Field>
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl; std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl; std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;
if(ErrorOnNoConverge){ GRID_ASSERT(true_residual / Tolerance < 10000.0); } if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }
IterationsToComplete = k; IterationsToComplete = k;
@@ -224,7 +224,7 @@ class BiCGSTAB : public OperatorFunction<Field>
std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl; std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;
if(ErrorOnNoConverge){ GRID_ASSERT(0); } if(ErrorOnNoConverge){ assert(0); }
IterationsToComplete = k; IterationsToComplete = k;
} }
}; };

View File

@@ -31,58 +31,6 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// Fills m(b,bp) = innerProduct(X[b],Y[bp]) for b,bp in [0, X.size()).
// m is written through operator() and is not resized here, so the caller must
// pass a matrix of at least X.size() x X.size(); Y must hold at least
// X.size() fields.
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
  const int Nblock = X.size();
  for(int row=0;row<Nblock;row++){
    for(int col=0;col<Nblock;col++){
      m(row,col) = innerProduct(X[row],Y[col]);
    }
  }
}
// AP[b] = Y[b] + scale * sum_bp m(bp,b) * X[bp]   for b in [0, AP.size())
//
// The result is accumulated in a staging vector and copied into AP only at
// the end, so AP may be the same object as X or Y.
// X must be non-empty: X[0] is used as the prototype for the staging fields.
//
// Possible optimisations noted by the original author: make this cache
// friendly with the site loop outermost (parallel_for), or pack X and AP into
// an Nblock x Volume dense array.
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
  typedef typename Field::scalar_type scomplex;
  const int Nblock = AP.size();

  std::vector<Field> staged(Nblock,X[0]);
  for(int col=0;col<Nblock;col++){
    staged[col] = Y[col];
    for(int row=0;row<Nblock;row++){
      staged[col] = staged[col] +scomplex(scale*m(row,col))*X[row];
    }
  }

  for(int col=0;col<Nblock;col++){
    AP[col] = staged[col];
  }
}
// AP[b] = sum_bp m(bp,b) * X[bp]   for b in [0, AP.size())
//
// Each AP[b] is zeroed and then accumulated in place, with no staging copy:
// AP must therefore not be the same vector as X.
// Should make this cache friendly with site outermost, parallel_for.
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
  typedef typename Field::scalar_type scomplex;
  const int Nblock = AP.size();
  for(int col=0;col<Nblock;col++){
    AP[col] = Zero();
    for(int row=0;row<Nblock;row++){
      AP[col] += scomplex(m(row,col))*X[row];
    }
  }
}
// Returns the sum of norm2(P[b]) over all fields in P (0.0 for an empty vector).
template<class Field>
double normv(const std::vector<Field> &P){
  const int Nblock = P.size();
  double total = 0.0;
  int b = 0;
  while ( b < Nblock ) {
    total += norm2(P[b]);
    b++;
  }
  return total;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec }; enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -98,7 +46,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
int Nblock; int Nblock;
BlockCGtype CGtype; BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -139,19 +87,10 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
sliceInnerProductMatrix(m_rr,R,R,Orthog); sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related // Force manifest hermitian to avoid rounding related
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -171,20 +110,11 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
const std::vector<Field> & R) const std::vector<Field> & R)
{ {
InnerProductMatrix(m_rr,R,R); InnerProductMatrix(m_rr,R,R);
/*
int rank=m_rr.rows();
for(int r=0;r<rank;r++){
for(int s=0;s<rank;s++){
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
}}
*/
m_rr = 0.5*(m_rr+m_rr.adjoint()); m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL(); Eigen::MatrixXcd L = m_rr.llt().matrixL();
// ComplexD det = L.determinant();
// std::cout << " Det m_rr "<<det<<std::endl;
C = L.adjoint(); C = L.adjoint();
Cinv = C.inverse(); Cinv = C.inverse();
@@ -201,7 +131,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
} else if (CGtype == CGmultiRHS ) { } else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi); CGmultiRHSsolve(Linop,Src,Psi);
} else { } else {
GRID_ASSERT(0); assert(0);
} }
} }
virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi) virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi)
@@ -209,7 +139,7 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
if ( CGtype == BlockCGrQVec ) { if ( CGtype == BlockCGrQVec ) {
BlockCGrQsolveVec(Linop,Src,Psi); BlockCGrQsolveVec(Linop,Src,Psi);
} else { } else {
GRID_ASSERT(0); assert(0);
} }
} }
@@ -256,13 +186,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
sliceNorm(ssq,B,Orthog); sliceNorm(ssq,B,Orthog);
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
sliceNorm(residuals,B,Orthog); sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog); sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************ /************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001) * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -292,9 +221,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD); Linop.HermOp(X, AD);
tmp = B - AD; tmp = B - AD;
sliceNorm(residuals,tmp,Orthog);
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q; D=Q;
@@ -310,8 +236,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
SolverTimer.Start(); SolverTimer.Start();
RealD max_resid=0;
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations; k++) {
@@ -356,7 +280,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
*/ */
m_rr = m_C.adjoint() * m_C; m_rr = m_C.adjoint() * m_C;
max_resid=0; RealD max_resid=0;
RealD rrsum=0; RealD rrsum=0;
RealD rr; RealD rr;
@@ -398,11 +322,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
} }
} }
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations if (ErrorOnNoConverge) assert(0);
<<" residual "<< std::sqrt(max_resid)<< std::endl;
if (ErrorOnNoConverge) GRID_ASSERT(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
@@ -438,10 +360,10 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog); sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog); sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess // Initial search dir is guess
Linop.HermOp(Psi, AP); Linop.HermOp(Psi, AP);
@@ -540,10 +462,47 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
} }
std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl; std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) GRID_ASSERT(0); if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
// Fills m(b,bp) = innerProduct(X[b],Y[bp]) for b,bp in [0, Nblock), where
// Nblock is the member set by the calling solver. m is not resized here.
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
  for(int row=0;row<Nblock;row++){
    for(int col=0;col<Nblock;col++){
      m(row,col) = innerProduct(X[row],Y[col]);
    }
  }
}
// AP[b] = Y[b] + scale * sum_bp m(bp,b) * X[bp]   for b in [0, Nblock)
//
// The result is built in a staging vector and copied into AP at the end, so
// AP may alias X or Y. X[0] is used as the prototype for the staging fields.
// Should make this cache friendly with site outermost, parallel_for.
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
  std::vector<Field> staged(Nblock,X[0]);
  for(int col=0;col<Nblock;col++){
    staged[col] = Y[col];
    for(int row=0;row<Nblock;row++){
      staged[col] = staged[col] + scomplex(scale*m(row,col))*X[row];
    }
  }

  for(int col=0;col<Nblock;col++){
    AP[col] = staged[col];
  }
}
// AP[b] = sum_bp m(bp,b) * X[bp]   for b in [0, Nblock)
//
// Accumulates directly into AP after zeroing it, so AP must not be the same
// vector as X.
// Should make this cache friendly with site outermost, parallel_for.
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
  for(int col=0;col<Nblock;col++){
    AP[col] = Zero();
    for(int row=0;row<Nblock;row++){
      AP[col] += scomplex(m(row,col))*X[row];
    }
  }
}
// Returns the sum of norm2(P[b]) over the first Nblock fields of P.
double normv(const std::vector<Field> &P){
  double total = 0.0;
  int b = 0;
  while ( b < Nblock ) {
    total += norm2(P[b]);
    b++;
  }
  return total;
}
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation: // BlockCGrQvec implementation:
//-------------------------- //--------------------------
@@ -554,7 +513,7 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X) void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X)
{ {
Nblock = B.size(); Nblock = B.size();
GRID_ASSERT(Nblock == X.size()); assert(Nblock == X.size());
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl; std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
@@ -590,14 +549,13 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
RealD sssum=0; RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
for(int b=0;b<Nblock;b++) sssum+=ssq[b]; for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);} for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);} for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); } for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************ /************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001) * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -627,7 +585,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
for(int b=0;b<Nblock;b++) { for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]); Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b]; tmp[b] = B[b] - AD[b];
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
} }
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
@@ -731,7 +688,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
} }
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl; std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) GRID_ASSERT(0); if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k; IterationsToComplete = k;
} }

View File

@@ -36,7 +36,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when CAGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -82,7 +82,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -137,7 +137,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl; std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(0); assert(0);
} }
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -185,7 +185,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
} }
} }
GRID_ASSERT(0); // Never reached assert(0); // Never reached
return cp; return cp;
} }

View File

@@ -38,14 +38,13 @@ NAMESPACE_BEGIN(Grid);
// single input vec, single output vec. // single input vec, single output vec.
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Field> template <class Field>
class ConjugateGradient : public OperatorFunction<Field> { class ConjugateGradient : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -55,26 +54,11 @@ public:
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), : Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv) ErrorOnNoConverge(err_on_no_conv){};
{};
virtual void LogIteration(int k,RealD a,RealD b){
// std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
};
virtual void LogBegin(void){
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
this->LogBegin();
GRID_TRACE("ConjugateGradient"); GRID_TRACE("ConjugateGradient");
GridStopWatch PreambleTimer;
GridStopWatch ConstructTimer;
GridStopWatch NormTimer;
GridStopWatch AssignTimer;
PreambleTimer.Start();
psi.Checkerboard() = src.Checkerboard(); psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
@@ -82,32 +66,22 @@ public:
RealD cp, c, a, d, b, ssq, qq; RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred; //RealD b_pred;
// Was doing copies Field p(src);
ConstructTimer.Start(); Field mmp(src);
Field p (src.Grid()); Field r(src);
Field mmp(src.Grid());
Field r (src.Grid());
ConstructTimer.Stop();
// Initial residual computation & set up // Initial residual computation & set up
NormTimer.Start();
ssq = norm2(src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
NormTimer.Stop(); assert(std::isnan(guess) == 0);
GRID_ASSERT(std::isnan(guess) == 0);
AssignTimer.Start();
if ( guess == 0.0 ) {
r = src;
p = r;
a = ssq;
} else {
Linop.HermOpAndNorm(psi, mmp, d, b); Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp; r = src - mmp;
p = r; p = r;
a = norm2(p); a = norm2(p);
}
cp = a; cp = a;
AssignTimer.Stop(); ssq = norm2(src);
// Handle trivial case of zero src // Handle trivial case of zero src
if (ssq == 0.){ if (ssq == 0.){
@@ -137,7 +111,6 @@ public:
std::cout << GridLogIterative << std::setprecision(8) std::cout << GridLogIterative << std::setprecision(8)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
PreambleTimer.Stop();
GridStopWatch LinalgTimer; GridStopWatch LinalgTimer;
GridStopWatch InnerTimer; GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer; GridStopWatch AxpyNormTimer;
@@ -183,7 +156,6 @@ public:
} }
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
LogIteration(k,a,b);
IterationTimer.Stop(); IterationTimer.Stop();
if ( (k % 500) == 0 ) { if ( (k % 500) == 0 ) {
@@ -211,18 +183,17 @@ public:
<< "\tTrue residual " << true_residual << "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl; << "\tTarget " << Tolerance << std::endl;
// std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tSolver Elapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "Time breakdown "<<std::endl; std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl; std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0); if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k; IterationsToComplete = k;
TrueResidual = true_residual; TrueResidual = true_residual;
@@ -231,143 +202,17 @@ public:
} }
} }
// Failed. Calculate true residual before giving up // Failed. Calculate true residual before giving up
// Linop.HermOpAndNorm(psi, mmp, d, qq); Linop.HermOpAndNorm(psi, mmp, d, qq);
// p = mmp - src; p = mmp - src;
//TrueResidual = sqrt(norm2(p)/ssq);
// TrueResidual = 1;
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations TrueResidual = sqrt(norm2(p)/ssq);
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
SolverTimer.Stop();
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage<< "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) GRID_ASSERT(0); std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
}; };
/////////////////////////////////////////////////////////////
// ConjugateGradient variant that records the scalars handed to the
// LogIteration() hook and, from them, accumulates the coefficients of
// the polynomial in the operator that the CG iteration has applied.
//
// LogBegin() clears the record; LogIteration() extends it by one step.
// PolyHermOp() applies the recorded polynomial to a vector, and
// CGsequenceHermOp() replays the recorded (a,b) sequence.
/////////////////////////////////////////////////////////////
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
  // Optionally record the CG polynomial
  std::vector<double> ak;         // 'a' argument of each LogIteration() call, in call order
  std::vector<double> bk;         // 'b' argument of each LogIteration() call, in call order
  std::vector<double> poly_p;     // coefficients of the search direction p, index = power of A
  std::vector<double> poly_r;     // coefficients of the residual r, index = power of A
  std::vector<double> poly_Ap;    // coefficients of A p, i.e. poly_p shifted up by one power
  std::vector<double> polynomial; // coefficients of the accumulated solution polynomial

public:
  // Forwards all arguments unchanged to the ConjugateGradient<Field> constructor.
  ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
    : ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
  { }

  // psi = sum_n polynomial[n] * A^n src, with A applied through Linop.HermOp().
  // An empty 'polynomial' (nothing recorded yet) is treated as the zero
  // polynomial, so psi is set to zero instead of indexing an empty vector.
  void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
  {
    if ( polynomial.empty() ) {
      psi = Zero();
      psi.Checkerboard() = src.Checkerboard();
      return;
    }

    Field tmp(src.Grid());
    Field AtoN(src.Grid());   // holds A^n src for the current n
    AtoN = src;
    psi=AtoN*polynomial[0];
    for(std::vector<double>::size_type n=1;n<polynomial.size();n++){
      tmp = AtoN;             // HermOp is given distinct input and output fields
      Linop.HermOp(tmp,AtoN);
      psi = psi + polynomial[n]*AtoN;
    }
  }

  // Replays the recorded (ak,bk) sequence from a zero starting vector:
  //   x = x + a p ;  r = r - a A p ;  p = r + b p
  // ak and bk are filled pairwise by LogIteration(), so they have equal length.
  void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
  {
    Field Ap(src.Grid());
    Field r(src.Grid());
    Field p(src.Grid());
    p=src;
    r=src;
    x=Zero();
    x.Checkerboard()=src.Checkerboard();
    for(std::vector<double>::size_type k=0;k<ak.size();k++){
      x = x + ak[k]*p;
      Linop.HermOp(p,Ap);
      r = r - ak[k] * Ap;
      p = r + bk[k] * p;
    }
  }

  // Runs the inherited solver starting from a zero guess.
  void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
  {
    psi=Zero();
    this->operator ()(Linop,src,psi);
  }

  // Hook override: discard anything recorded previously and seed the
  // p and r polynomials with the constant 1 (p = r = src for a zero guess).
  void LogBegin(void) override
  {
    std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
    ak.resize(0);
    bk.resize(0);
    polynomial.resize(0);
    poly_Ap.resize(0);
    poly_p.resize(1);
    poly_r.resize(1);
    poly_p[0]=1.0;
    poly_r[0]=1.0;
  }

  // Hook override: record step k with scalars (a,b) and advance the
  // polynomial bookkeeping by one CG step.
  // Requires k >= 1 and calls made with k = 1,2,3,... after LogBegin();
  // polynomial[k-1] and poly_p[0..k-1] are indexed below.
  void LogIteration(int k,RealD a,RealD b) override
  {
    // With zero guess,
    // p = r = src
    //
    // iterate:
    //   x = x + a p
    //   r = r - a A p
    //   p = r + b p
    //
    // [0]
    // r = x
    // p = x
    // Ap=0
    //
    // [1]
    // Ap = A x + 0 ==> shift poly P right by 1 and add 0.
    // x  = x + a p ==> add polynomials term by term
    // r  = r - a A p ==> add polynomials term by term
    // p  = r + b p ==> add polynomials term by term
    //
    std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
    ak.push_back(a);
    bk.push_back(b);

    // Ap= right_shift(p)
    poly_Ap.resize(k+1);
    poly_Ap[0]=0.0;
    for(int i=0;i<k;i++){
      poly_Ap[i+1]=poly_p[i];
    }

    // x = x + a p
    polynomial.resize(k);
    polynomial[k-1]=0.0;
    for(int i=0;i<k;i++){
      polynomial[i] = polynomial[i] + a * poly_p[i];
    }

    // r = r - a Ap
    // p = r + b p
    poly_r.resize(k+1);
    poly_p.resize(k+1);
    poly_r[k] = poly_p[k] = 0.0;
    for(int i=0;i<k+1;i++){
      poly_r[i] = poly_r[i] - a * poly_Ap[i];
      poly_p[i] = poly_r[i] + b * poly_p[i];
    }
  }
};
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
//Compute double precision rsd and also new RHS vector. //Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d); Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){ if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break; break;
} }
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp); precisionChange(src_f, src_d, pc_wk_dp_to_sp);

View File

@@ -77,7 +77,7 @@ public:
} }
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){ void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
GRID_ASSERT(src_d_in.size() == sol_d.size()); assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size(); int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl; std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;

View File

@@ -98,15 +98,15 @@ public:
std::vector<RealD> alpha(nshift,1.0); std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions std::vector<Field> ps(nshift,grid);// Search directions
GRID_ASSERT(psi.size()==nshift); assert(psi.size()==nshift);
GRID_ASSERT(mass.size()==nshift); assert(mass.size()==nshift);
GRID_ASSERT(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// remove dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift); RealD bs[nshift];
std::vector<RealD> rsq(nshift); RealD rsq[nshift];
std::vector<std::array<RealD,2> > z(nshift); RealD z[nshift][2];
std::vector<int> converged(nshift); int converged[nshift];
const int primary =0; const int primary =0;
@@ -122,7 +122,7 @@ public:
// Check lightest mass // Check lightest mass
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
GRID_ASSERT( mass[s]>= mass[primary] ); assert( mass[s]>= mass[primary] );
converged[s]=0; converged[s]=0;
} }
@@ -144,7 +144,7 @@ public:
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s]; rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid^2 "<<rsq[s]<<std::endl; <<" target resid "<<rsq[s]<<std::endl;
ps[s] = src; ps[s] = src;
} }
// r and p for primary // r and p for primary
@@ -338,7 +338,7 @@ public:
} }
// ugly hack // ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// GRID_ASSERT(0); // assert(0);
} }
}; };

View File

@@ -118,16 +118,16 @@ public:
FieldF r_f(SinglePrecGrid); FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid); FieldD mmp_d(DoublePrecGrid);
GRID_ASSERT(psi_d.size()==nshift); assert(psi_d.size()==nshift);
GRID_ASSERT(mass.size()==nshift); assert(mass.size()==nshift);
GRID_ASSERT(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift); RealD bs[nshift];
std::vector<RealD> rsq(nshift); RealD rsq[nshift];
std::vector<RealD> rsqf(nshift); RealD rsqf[nshift];
std::vector<std::array<RealD,2> > z(nshift); RealD z[nshift][2];
std::vector<int> converged(nshift); int converged[nshift];
const int primary =0; const int primary =0;
@@ -141,7 +141,7 @@ public:
// Check lightest mass // Check lightest mass
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
GRID_ASSERT( mass[s]>= mass[primary] ); assert( mass[s]>= mass[primary] );
converged[s]=0; converged[s]=0;
} }
@@ -179,7 +179,7 @@ public:
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d; tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// GRID_ASSERT(norm2(tmp_d)< 1.0e-4); // assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d); axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d); RealD rn = norm2(p_d);
@@ -365,7 +365,7 @@ public:
} }
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
GRID_ASSERT(0); assert(0);
} }
}; };

View File

@@ -48,12 +48,12 @@ public:
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){} ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); } void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); } void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); } void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ GRID_ASSERT(0); } void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ GRID_ASSERT(0); } void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out); linop_base.HermOp(in, out);
@@ -151,16 +151,16 @@ public:
FieldD r_d(DoublePrecGrid); FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid); FieldD mmp_d(DoublePrecGrid);
GRID_ASSERT(psi_d.size()==nshift); assert(psi_d.size()==nshift);
GRID_ASSERT(mass.size()==nshift); assert(mass.size()==nshift);
GRID_ASSERT(mresidual.size()==nshift); assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector // dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift); RealD bs[nshift];
std::vector<RealD> rsq(nshift); RealD rsq[nshift];
std::vector<RealD> rsqf(nshift); RealD rsqf[nshift];
std::vector<std::array<RealD,2> > z(nshift); RealD z[nshift][2];
std::vector<int> converged(nshift); int converged[nshift];
const int primary =0; const int primary =0;
@@ -174,7 +174,7 @@ public:
// Check lightest mass // Check lightest mass
for(int s=0;s<nshift;s++){ for(int s=0;s<nshift;s++){
GRID_ASSERT( mass[s]>= mass[primary] ); assert( mass[s]>= mass[primary] );
converged[s]=0; converged[s]=0;
} }
@@ -211,7 +211,7 @@ public:
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp) Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d; tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl; std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
GRID_ASSERT(norm2(tmp_d)< 1.0); assert(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d); axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d); RealD rn = norm2(p_d);
@@ -408,7 +408,7 @@ public:
} }
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
GRID_ASSERT(0); assert(0);
} }
}; };

View File

@@ -35,7 +35,7 @@ template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> { class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public: public:
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the CG fails to converge. bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -66,7 +66,7 @@ public:
DoFinalCleanup(true), DoFinalCleanup(true),
Linop_fallback(NULL) Linop_fallback(NULL)
{ {
GRID_ASSERT(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1"); assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
}; };
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){ void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
@@ -90,7 +90,7 @@ public:
// Initial residual computation & set up // Initial residual computation & set up
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b); Linop_d.HermOpAndNorm(psi, mmp, d, b);
@@ -217,7 +217,7 @@ public:
CG(Linop_d,src,psi); CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete; IterationsToCleanup = CG.IterationsToComplete;
} }
else if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0); else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n"; std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return; return;
@@ -263,7 +263,7 @@ public:
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge" std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl; << std::endl;
if (ErrorOnNoConverge) GRID_ASSERT(0); if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k; IterationsToComplete = k;
ReliableUpdatesPerformed = l; ReliableUpdatesPerformed = l;
} }

View File

@@ -106,7 +106,7 @@ public:
} }
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl; std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
GRID_ASSERT(0); assert(0);
} }
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -69,8 +69,8 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N) DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N) : evec(_evec), eval(_eval), N(_N)
{ {
GRID_ASSERT(evec.size()==eval.size()); assert(evec.size()==eval.size());
GRID_ASSERT(N <= evec.size()); assert(N <= evec.size());
} }
virtual void operator()(const Field &src,Field &guess) { virtual void operator()(const Field &src,Field &guess) {
@@ -141,7 +141,8 @@ public:
} }
//postprocessing //postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl; std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++) { for (int j=0;j<Nsrc;j++)
{
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl; std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace); blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard(); guess[j].Checkerboard() = src[j].Checkerboard();

View File

@@ -36,7 +36,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FCAGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -87,7 +87,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -144,7 +144,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl; std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(0); assert(0);
} }
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -191,7 +191,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
} }
} }
GRID_ASSERT(0); // Never reached assert(0); // Never reached
return cp; return cp;
} }

View File

@@ -36,7 +36,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -85,7 +85,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -142,7 +142,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl; std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(0); assert(0);
} }
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -189,7 +189,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
} }
} }
GRID_ASSERT(0); // Never reached assert(0); // Never reached
return cp; return cp;
} }

View File

@@ -36,7 +36,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when GMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -80,7 +80,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -135,7 +135,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl; std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(0); assert(0);
} }
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -181,7 +181,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
} }
} }
GRID_ASSERT(0); // Never reached assert(0); // Never reached
return cp; return cp;
} }

View File

@@ -175,7 +175,7 @@ public:
eresid(_eresid), MaxIter(_MaxIter), eresid(_eresid), MaxIter(_MaxIter),
diagonalisation(_diagonalisation),split_test(0), diagonalisation(_diagonalisation),split_test(0),
Nevec_acc(_Nu) Nevec_acc(_Nu)
{ GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); }; { assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
//////////////////////////////// ////////////////////////////////
// Helpers // Helpers
@@ -206,7 +206,7 @@ public:
Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl; Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
} }
} }
GRID_ASSERT(normalize(w,if_print) != 0); assert(normalize(w,if_print) != 0);
} }
void reorthogonalize(Field& w, std::vector<Field>& evec, int k) void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
{ {
@@ -225,7 +225,7 @@ public:
w[i] = w[i] - ip * evec[j]; w[i] = w[i] - ip * evec[j];
}} }}
for(int i=0; i<_Nu; ++i) for(int i=0; i<_Nu; ++i)
GRID_ASSERT(normalize(w[i],if_print) !=0); assert(normalize(w[i],if_print) !=0);
} }
@@ -244,7 +244,7 @@ public:
const uint64_t sites = grid->lSites(); const uint64_t sites = grid->lSites();
int Nbatch = R/Nevec_acc; int Nbatch = R/Nevec_acc;
GRID_ASSERT( R%Nevec_acc == 0 ); assert( R%Nevec_acc == 0 );
// Glog << "nBatch, Nevec_acc, R, Nu = " // Glog << "nBatch, Nevec_acc, R, Nu = "
// << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl; // << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl;
@@ -302,7 +302,7 @@ public:
} }
} }
for (int i=0; i<Nu; ++i) { for (int i=0; i<Nu; ++i) {
GRID_ASSERT(normalize(w[i],do_print)!=0); assert(normalize(w[i],do_print)!=0);
} }
Glog << "cuBLAS Zgemm done"<< std::endl; Glog << "cuBLAS Zgemm done"<< std::endl;
@@ -374,8 +374,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
{ {
std::string fname = std::string(cname+"::calc_irbl()"); std::string fname = std::string(cname+"::calc_irbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
GRID_ASSERT(grid == src[0].Grid()); assert(grid == src[0].Grid());
GRID_ASSERT( Nu = src.size() ); assert( Nu = src.size() );
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
@@ -396,7 +396,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
GRID_ASSERT(Nm == evec.size() && Nm == eval.size()); assert(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -579,8 +579,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
{ {
std::string fname = std::string(cname+"::calc_rbl()"); std::string fname = std::string(cname+"::calc_rbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
GRID_ASSERT(grid == src[0].Grid()); assert(grid == src[0].Grid());
GRID_ASSERT( Nu = src.size() ); assert( Nu = src.size() );
int Np = (Nm-Nk); int Np = (Nm-Nk);
if (Np > 0 && MaxIter > 1) Np /= MaxIter; if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -607,7 +607,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
GRID_ASSERT(Nm == evec.size() && Nm == eval.size()); assert(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -785,7 +785,7 @@ private:
int Nu = w.size(); int Nu = w.size();
int Nm = evec.size(); int Nm = evec.size();
GRID_ASSERT( b < Nm/Nu ); assert( b < Nm/Nu );
// GridCartesian *grid = evec[0]._grid; // GridCartesian *grid = evec[0]._grid;
// converts block index to full indicies for an interval [L,R) // converts block index to full indicies for an interval [L,R)
@@ -796,7 +796,7 @@ private:
Glog << "Using split grid"<< std::endl; Glog << "Using split grid"<< std::endl;
// LatticeGaugeField s_Umu(SGrid); // LatticeGaugeField s_Umu(SGrid);
GRID_ASSERT((Nu%mrhs)==0); assert((Nu%mrhs)==0);
std::vector<Field> in(mrhs,f_grid); std::vector<Field> in(mrhs,f_grid);
Field s_in(sf_grid); Field s_in(sf_grid);
@@ -906,7 +906,7 @@ if(split_test){
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl; // Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
GRID_ASSERT (!isnan(norm2(w[u]))); assert (!isnan(norm2(w[u])));
for (int k=L+u; k<R; ++k) { for (int k=L+u; k<R; ++k) {
Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
} }
@@ -929,8 +929,8 @@ if(split_test){
Eigen::MatrixXcd & Qt, // Nm x Nm Eigen::MatrixXcd & Qt, // Nm x Nm
GridBase *grid) GridBase *grid)
{ {
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -970,8 +970,8 @@ if(split_test){
GridBase *grid) GridBase *grid)
{ {
Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl; Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -1119,7 +1119,7 @@ if (1){
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#endif #endif
} else { } else {
GRID_ASSERT(0); assert(0);
} }
} }
@@ -1131,8 +1131,8 @@ if (1){
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
//Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk); M = Eigen::MatrixXcd::Zero(Nk,Nk);
// rearrange // rearrange
@@ -1159,8 +1159,8 @@ if (1){
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
//Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
// rearrange // rearrange
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {

View File

@@ -79,16 +79,14 @@ template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public Imp
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0); RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13); std::cout.precision(13);
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] " std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")" <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<" target " << eresid*eresid << " conv " <<conv
<<std::endl; <<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv; return conv;
} }
}; };
@@ -211,7 +209,7 @@ until convergence
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false) void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{ {
GridBase *grid = src.Grid(); GridBase *grid = src.Grid();
GRID_ASSERT(grid == evec[0].Grid()); assert(grid == evec[0].Grid());
// GridLogIRL.TimingMode(1); // GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -231,7 +229,7 @@ until convergence
} }
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
GRID_ASSERT(Nm <= evec.size() && Nm <= eval.size()); assert(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;
@@ -245,10 +243,9 @@ until convergence
_HermOp(src_n,tmp); _HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0); // std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl; // std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = std::sqrt(vnum/vden); RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.0001) if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_; i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na; evalMaxApprox = na;
@@ -256,7 +253,6 @@ until convergence
src_n = tmp; src_n = tmp;
} }
} }
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm); std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm); std::vector<RealD> lme2(Nm);
@@ -337,7 +333,7 @@ until convergence
} }
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl; std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
GRID_ASSERT(k2<Nm); GRID_ASSERT(k2<Nm); GRID_ASSERT(k1>0); assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl; std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
@@ -461,15 +457,15 @@ until convergence
std::vector<Field>& evec, std::vector<Field>& evec,
Field& w,int Nm,int k) Field& w,int Nm,int k)
{ {
std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl; std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20; const RealD tiny = 1.0e-20;
GRID_ASSERT( k< Nm ); assert( k< Nm );
GridStopWatch gsw_op,gsw_o; GridStopWatch gsw_op,gsw_o;
Field& evec_k = evec[k]; Field& evec_k = evec[k];
_PolyOp(evec_k,w); std::cout<<GridLogDebug << "PolyOp" <<std::endl; _PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
if(k>0) w -= lme[k-1] * evec[k-1]; if(k>0) w -= lme[k-1] * evec[k-1];
@@ -484,18 +480,18 @@ until convergence
lme[k] = beta; lme[k] = beta;
if ( (k>0) && ( (k % orth_period) == 0 )) { if ( (k>0) && ( (k % orth_period) == 0 )) {
std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl; std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
orthogonalize(w,evec,k); // orthonormalise orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl; std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
} }
if(k < Nm-1) evec[k+1] = w; if(k < Nm-1) evec[k+1] = w;
std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl; std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny ) if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl; std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl; std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
} }
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme, void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
@@ -597,7 +593,7 @@ until convergence
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) { } else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid); diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
} else { } else {
GRID_ASSERT(0); assert(0);
} }
} }
@@ -687,7 +683,7 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
} }
} }
#else #else
GRID_ASSERT(0); assert(0);
#endif #endif
} }

View File

@@ -80,7 +80,7 @@ public:
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace) _Linop(linop), subspace(_subspace)
{ {
GRID_ASSERT(subspace.size() >0); assert(subspace.size() >0);
}; };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
@@ -346,12 +346,12 @@ public:
void testFine(RealD resid) void testFine(RealD resid)
{ {
GRID_ASSERT(evals_fine.size() == nbasis); assert(evals_fine.size() == nbasis);
GRID_ASSERT(subspace.size() == nbasis); assert(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op); ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){ for(int k=0;k<nbasis;k++){
GRID_ASSERT(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1); assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
} }
} }
@@ -359,8 +359,8 @@ public:
//hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{ {
GRID_ASSERT(evals_fine.size() == nbasis); assert(evals_fine.size() == nbasis);
GRID_ASSERT(subspace.size() == nbasis); assert(subspace.size() == nbasis);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -380,7 +380,7 @@ public:
void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes) RealD MaxIt, RealD betastp, int MinRes)
{ {
GRID_ASSERT(nbasis<=Nm); assert(nbasis<=Nm);
Chebyshev<FineField> Cheby(cheby_parms); Chebyshev<FineField> Cheby(cheby_parms);
FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp); FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
@@ -400,8 +400,8 @@ public:
IRL.calc(evals_fine,subspace,src,Nconv,false); IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved // Shrink down to number saved
GRID_ASSERT(Nstop>=nbasis); assert(Nstop>=nbasis);
GRID_ASSERT(Nconv>=nbasis); assert(Nconv>=nbasis);
evals_fine.resize(nbasis); evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid); subspace.resize(nbasis,_FineGrid);
} }
@@ -433,7 +433,7 @@ public:
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0; int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
GRID_ASSERT(Nconv>=Nstop); assert(Nconv>=Nstop);
evals_coarse.resize(Nstop); evals_coarse.resize(Nstop);
evec_coarse.resize (Nstop,_CoarseGrid); evec_coarse.resize (Nstop,_CoarseGrid);
for (int i=0;i<Nstop;i++){ for (int i=0;i<Nstop;i++){

View File

@@ -35,7 +35,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an GRID_ASSERT when the MR fails to converge. bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -59,7 +59,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
// Initial residual computation & set up // Initial residual computation & set up
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
@@ -136,7 +136,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl; std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(true_residual / Tolerance < 10000.0); assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k; IterationsToComplete = k;
@@ -148,7 +148,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
<< std::endl; << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(0); assert(0);
IterationsToComplete = k; IterationsToComplete = k;
} }

View File

@@ -37,7 +37,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
using OperatorFunction<FieldD>::operator(); using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an GRID_ASSERT when MPFGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -91,7 +91,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -150,7 +150,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl; std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(0); assert(0);
} }
RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
@@ -197,7 +197,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
} }
} }
GRID_ASSERT(0); // Never reached assert(0); // Never reached
return cp; return cp;
} }

View File

@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver // Take a matrix and form an NE solver calling a Herm solver
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class NormalEquations : public LinearFunction<Field>{ template<class Field> class NormalEquations {
private: private:
SparseMatrixBase<Field> & _Matrix; SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver; OperatorFunction<Field> & _HermitianSolver;
@@ -60,33 +60,7 @@ public:
} }
}; };
template<class Field> class NormalResidual : public LinearFunction<Field>{ template<class Field> class HPDSolver {
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
Field res(in.Grid());
Field tmp(in.Grid());
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
_Guess(in,res);
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
_Matrix.Mdag(res,out); // out = Mdag res
}
};
template<class Field> class HPDSolver : public LinearFunction<Field> {
private: private:
LinearOperatorBase<Field> & _Matrix; LinearOperatorBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver; OperatorFunction<Field> & _HermitianSolver;
@@ -104,13 +78,13 @@ public:
void operator() (const Field &in, Field &out){ void operator() (const Field &in, Field &out){
_Guess(in,out); _Guess(in,out);
_HermitianSolver(_Matrix,in,out); //M out = in _HermitianSolver(_Matrix,in,out); // Mdag M out = Mdag in
} }
}; };
template<class Field> class MdagMSolver : public LinearFunction<Field> { template<class Field> class MdagMSolver {
private: private:
SparseMatrixBase<Field> & _Matrix; SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver; OperatorFunction<Field> & _HermitianSolver;

View File

@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;
auto src_n = src; auto src_n = src;
auto tmp = src; auto tmp = src;
const int _MAX_ITER_EST_ = 200; const int _MAX_ITER_EST_ = 50;
for (int i=0;i<_MAX_ITER_EST_;i++) { for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -30,17 +30,18 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl; std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) { if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
// evalMaxApprox = na;
// return evalMaxApprox;
// }
evalMaxApprox = na; evalMaxApprox = na;
src_n = tmp;
}
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox; return evalMaxApprox;
} }
evalMaxApprox = na;
src_n = tmp;
}
assert(0);
return 0;
}
}; };
} }

View File

@@ -1,76 +0,0 @@
#pragma once
namespace Grid {
// Indicator function of the open interval (lo, hi): evaluates to 1.0 for
// arguments strictly inside the interval and to 0.0 everywhere else.
class Band
{
  RealD lo, hi;
public:
  // _lo / _hi : lower and upper edge of the band (both edges are excluded).
  Band(RealD _lo,RealD _hi) : lo(_lo), hi(_hi) {}

  // Returns 1.0 when lo < x < hi, otherwise 0.0.
  RealD operator() (RealD x){
    const bool inside = (x>lo) && (x<hi);
    return inside ? 1.0 : 0.0;
  }
};
// Reports how norm2 of a source field is distributed over a set of
// consecutive bands [0,ranges[0]], [ranges[0],ranges[1]], ...
// For every band a Chebyshev object is initialised with a Band functor on
// [0, ranges.back()], Jackson-smoothed, applied to the source through the
// supplied operator, and norm2 of the result is logged relative to norm2(src).
// NOTE(review): presumably the Chebyshev object approximates the band
// indicator as a polynomial of HermOp -- confirm against the Chebyshev class.
class PowerSpectrum
{
public:
  // Rescales v by 1/sqrt(norm2(v)) and returns sqrt(norm2(v)) as measured
  // before the rescaling.
  // NOTE(review): a field with norm2(v)==0 leads to a division by zero here.
  template<typename T> static RealD normalise(T& v)
  {
    RealD nn = norm2(v);
    nn = sqrt(nn);
    v = v * (1.0/nn);
    return nn;
  }

  std::vector<RealD> ranges; // upper edge of each band; the last entry is also used as the Chebyshev upper bound
  std::vector<int> order;    // order passed to Chebyshev::Init for each band (one entry per band)

  // Copies the band edges and the per-band orders.
  PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { }

  // Logs the fraction of norm2(src) found in each band, followed by the sum
  // over all bands.
  //   HermOp : operator handed to the Chebyshev object
  //   src    : field whose spectral content is analysed
  // The return value is always 0; all results are reported through GridLogMessage.
  template<class Field>
  RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
  {
    int N=ranges.size();

    // Without this guard ranges[N-1] and order[b] below would read out of bounds.
    if ( ranges.empty() || order.size() < ranges.size() ) {
      std::cout << GridLogMessage << " PowerSpectrum: need at least one band and one order per band; nothing computed"<<std::endl;
      return 0;
    }

    RealD hi = ranges[N-1];

    RealD lo_band = 0.0;
    RealD hi_band;
    RealD nn=norm2(src);
    RealD ss=0.0;

    Field tmp = src;

    for(int b=0;b<N;b++){
      hi_band = ranges[b];
      Band Notch(lo_band,hi_band);

      Chebyshev<Field> polynomial;
      polynomial.Init(0.0,hi,order[b],Notch);
      polynomial.JacksonSmooth();

      polynomial(HermOp,src,tmp) ;

      // Evaluate the norm once and reuse it for both the running sum and the log line.
      RealD p=norm2(tmp);
      ss=ss+p;
      std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<p/nn<<std::endl;

      lo_band=hi_band;
    }
    std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
    // NOTE(review): this line prints norm2(src), not the band sum ss -- confirm that is intended.
    std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;

    return 0;
  }
};
}

View File

@@ -112,7 +112,7 @@ public:
} }
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl; std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
GRID_ASSERT(0); assert(0);
} }
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -118,7 +118,7 @@ public:
} }
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// GRID_ASSERT(0); // assert(0);
} }
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -221,7 +221,7 @@ public:
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){ for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0); int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back]; p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -231,7 +231,7 @@ public:
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
GRID_ASSERT(0); // never reached assert(0); // never reached
return cp; return cp;
} }
}; };

View File

@@ -74,7 +74,7 @@ public:
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
// psi=Zero(); psi=Zero();
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
ssq=norm2(src); ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq; rsq=Tolerance*Tolerance*ssq;
@@ -113,7 +113,7 @@ public:
} }
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// GRID_ASSERT(0); // assert(0);
} }
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -224,7 +224,7 @@ public:
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){ for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0); int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back]; p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -234,7 +234,7 @@ public:
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
GRID_ASSERT(0); // never reached assert(0); // never reached
return cp; return cp;
} }
}; };

View File

@@ -79,7 +79,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
LinOp.Op(x,r); r = b - r; LinOp.Op(x,r); r = b - r;
GRID_ASSERT(normb> 0.0); assert(normb> 0.0);
resid = norm2(r)/normb; resid = norm2(r)/normb;
if (resid <= Tolerance) { if (resid <= Tolerance) {
@@ -105,8 +105,8 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
for (int i = 1; i <= MaxIterations; i++) { for (int i = 1; i <= MaxIterations; i++) {
// Breakdown tests // Breakdown tests
GRID_ASSERT( rho != 0.0); assert( rho != 0.0);
GRID_ASSERT( xi != 0.0); assert( xi != 0.0);
v = (1. / rho) * v_tld; v = (1. / rho) * v_tld;
y = (1. / rho) * y; y = (1. / rho) * y;
@@ -134,10 +134,10 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
ep=Zep.real(); ep=Zep.real();
std::cout << "Zep "<<Zep <<std::endl; std::cout << "Zep "<<Zep <<std::endl;
// Complex Audit // Complex Audit
GRID_ASSERT(abs(ep)>0); assert(abs(ep)>0);
beta = ep / delta; beta = ep / delta;
GRID_ASSERT(abs(beta)>0); assert(abs(beta)>0);
v_tld = p_tld - beta * v; v_tld = p_tld - beta * v;
y = v_tld; y = v_tld;
@@ -158,7 +158,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
std::cout << "theta "<<theta<<std::endl; std::cout << "theta "<<theta<<std::endl;
std::cout << "gamma "<<gamma<<std::endl; std::cout << "gamma "<<gamma<<std::endl;
GRID_ASSERT(abs(gamma)> 0.0); assert(abs(gamma)> 0.0);
eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1); eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
@@ -178,7 +178,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
} }
std::cout << "Iteration "<<i<<" resid " << resid<<std::endl; std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
} }
GRID_ASSERT(0); assert(0);
return; // no convergence return; // no convergence
} }
#else #else

View File

@@ -327,9 +327,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e) // src_o = (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm. _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
} }
@@ -347,17 +347,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even); src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix); SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@@ -396,13 +396,13 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -416,17 +416,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; GRID_ASSERT( src_e_i.Checkerboard() ==Even); src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@@ -461,9 +461,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even ); _Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd ); _Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd ); src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -478,18 +478,18 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o, tmp); GRID_ASSERT( tmp.Checkerboard() == Even ); _Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even );
src_e_i = src_e - tmp; GRID_ASSERT( src_e_i.Checkerboard() == Even ); src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even );
_Matrix.MooeeInv(src_e_i, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even ); _Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even ); setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o); GRID_ASSERT( sol_o.Checkerboard() == Odd ); setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{ {
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix); NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o); GRID_ASSERT(sol_o.Checkerboard() == Odd); this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd);
} }
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
@@ -499,87 +499,6 @@ namespace Grid {
} }
}; };
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, left preconditioned by Mee^inv
// ( 1 - Mee^inv Meo Moo^inv Moe ) phi = Mee_inv ( Mee - Meo Moo^inv Moe Mee^inv ) phi = Mee_inv eta
//
// Solve:
// ( 1 - Mee^inv Meo Moo^inv Moe )^dag ( 1 - Mee^inv Meo Moo^inv Moe ) phi = ( 1 - Mee^inv Meo Moo^inv Moe )^dag Mee_inv eta
//
// Old notation e<->o
//
// Left precon by Moo^-1
// b) (Doo^{dag} M_oo^-dag) (Moo^-1 Doo) psi_o = [ (D_oo)^dag M_oo^-dag ] Moo^-1 L^{-1} eta_o
// eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Schur red-black solver wrapper whose odd-checkerboard system is built from SchurDiagOneOperator.
//  - RedBlackSource   : splits the source and prepares the preconditioned odd source
//  - RedBlackSolve    : hands the odd system to the wrapped solver (this->_HermitianRBSolver)
//  - RedBlackSolution : rebuilds the even sites and assembles the full solution
template<class Field> class SchurRedBlackDiagOneSolve : public SchurRedBlackBase<Field> {
public:
  typedef CheckerBoardedSparseMatrixBase<Field> Matrix;

  /////////////////////////////////////////////////////
  // Wrap the usual normal equations Schur trick
  /////////////////////////////////////////////////////
  // All three arguments are forwarded unchanged to SchurRedBlackBase.
  SchurRedBlackDiagOneSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
                            const bool _solnAsInitGuess = false)
    : SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {}

  // Splits src into its Even/Odd halves (src_e, src_o) and then overwrites src_o with the
  // preconditioned odd source.  src_e is left holding the plain even half of src.
  virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
  {
    GridBase *grid = _Matrix.RedBlackGrid();
    SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
    Field tmp(grid);
    Field Mtmp(grid);

    pickCheckerboard(Even,src_e,src);
    pickCheckerboard(Odd ,src_o,src);

    /////////////////////////////////////////////////////
    // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
    /////////////////////////////////////////////////////
    _Matrix.MooeeInv(src_e,tmp);
    GRID_ASSERT( tmp.Checkerboard() ==Even);

    _Matrix.Meooe (tmp,Mtmp);
    GRID_ASSERT( Mtmp.Checkerboard() ==Odd);

    Mtmp=src_o-Mtmp;

    _Matrix.MooeeInv(Mtmp,tmp);
    GRID_ASSERT( tmp.Checkerboard() ==Odd);

    // get the right MpcDag
    _HermOpEO.MpcDag(tmp,src_o);
    GRID_ASSERT(src_o.Checkerboard() ==Odd);
  }

  // Reconstructs the even half from the odd solution and writes both halves into sol.
  virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
  {
    GridBase *grid = _Matrix.RedBlackGrid();
    Field tmp(grid);
    Field sol_e(grid);

    ///////////////////////////////////////////////////
    // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
    ///////////////////////////////////////////////////
    _Matrix.Meooe(sol_o,tmp);
    GRID_ASSERT( tmp.Checkerboard() ==Even);

    tmp = src_e-tmp;
    GRID_ASSERT( src_e.Checkerboard() ==Even);

    _Matrix.MooeeInv(tmp,sol_e);
    GRID_ASSERT( sol_e.Checkerboard() ==Even);

    setCheckerboard(sol,sol_e);
    GRID_ASSERT( sol_e.Checkerboard() ==Even);

    setCheckerboard(sol,sol_o);
    GRID_ASSERT( sol_o.Checkerboard() ==Odd );
  }

  // Single right-hand-side solve of the odd system.
  virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
  {
    SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
    this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
  }

  // Multiple right-hand-side solve: the whole vectors are passed to the wrapped solver.
  virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
  {
    SchurDiagOneOperator<Matrix,Field> _HermOpEO(_Matrix);
    this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv // Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta // ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
@@ -612,12 +531,12 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -638,12 +557,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even); tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -684,9 +603,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even ); _Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd ); _Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd ); src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -707,12 +626,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i, tmp); GRID_ASSERT( tmp.Checkerboard() == Even ); _Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even );
tmp = src_e - tmp; GRID_ASSERT( src_e.Checkerboard() == Even ); tmp = src_e - tmp; assert( src_e.Checkerboard() == Even );
_Matrix.MooeeInv(tmp, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even ); _Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even ); setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() == Odd ); setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd );
}; };
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)

View File

@@ -1,608 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/Aggregates.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid);
// Weight function f(x) = x^(-5); it is handed to Chebyshev<> as the function to approximate
// by the power-law subspace filters in this file.
inline RealD AggregatePowerLaw(RealD x)
{
  // Exponents -4 and -3 were the earlier candidates; -5 is the one in use.
  constexpr int exponent = -5;
  return std::pow(x,exponent);
}
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
// Number of basis vectors, i.e. the compile-time template parameter nbasis.
constexpr int Nbasis(void)
{
  return nbasis;
}
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
// Returns the checkerboard tag that was passed to the constructor.
int Checkerboard(void)
{
  return checkerboard;
}
// Stores both grid pointers and the checkerboard tag, and allocates nbasis subspace
// vectors on the fine grid.
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard)
  : CoarseGrid(_CoarseGrid), FineGrid(_FineGrid),
    subspace(nbasis,_FineGrid),
    checkerboard(_checkerboard)
{}
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
// Fine -> coarse: fills CoarseVec via blockProject of FineVec onto the stored subspace.
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec)
{
  blockProject(CoarseVec,FineVec,subspace);
}
// Coarse -> fine: fills FineVec via blockPromote of CoarseVec using the stored subspace.
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec)
{
  // The output takes the checkerboard tag of the first basis vector before promotion.
  const int basis_cb = subspace[0].Checkerboard();
  FineVec.Checkerboard() = basis_cb;
  blockPromote(CoarseVec,FineVec,subspace);
}
// Fills all nbasis subspace vectors with Gaussian noise, each rescaled by norm2()^(-1/2).
virtual void CreateSubspaceRandom(GridParallelRNG &RNG)
{
  FineField noise(FineGrid);
  for(int b=0;b<nbasis;b++){
    subspace[b] = Zero();
    gaussian(RNG,noise);
    const RealD inv_norm = std::pow(norm2(noise),-0.5);
    noise = noise*inv_norm;
    subspace[b] = noise;
  }
}
// Builds the first nn subspace vectors by repeated CG solves on hermop: each rescaled
// Gaussian noise vector is used as the right-hand side of four successive solves, the
// solution being rescaled and fed back in as the next right-hand side.
// NOTE(review): CG is built with (1.0e-3,400,false); the sibling RefineSubspace passes
// (tol,maxit,false), so these are presumably tolerance and iteration cap -- confirm.
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
  const int npass = 4;
  ConjugateGradient<FineField> CG(1.0e-3,400,false);
  FineField noise(FineGrid);
  FineField Mn(FineGrid);

  for(int b=0;b<nn;b++){
    subspace[b] = Zero();
    gaussian(RNG,noise);
    RealD inv_norm = std::pow(norm2(noise),-0.5);
    noise = noise*inv_norm;

    // Diagnostic before filtering
    hermop.Op(noise,Mn);
    std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;

    for(int pass=0;pass<npass;pass++){
      // subspace[b] is both the starting guess and the result of each solve
      CG(hermop,noise,subspace[b]);
      noise = subspace[b];
      inv_norm = std::pow(norm2(noise),-0.5);
      noise = noise*inv_norm;
    }

    // Diagnostic after filtering
    hermop.Op(noise,Mn);
    std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
    subspace[b] = noise;
  }
}
// Builds the first nn subspace vectors with a preconditioned GCR solver on DiracOp:
// each rescaled Gaussian noise vector goes through two successive solves, the result
// being rescaled and reused as the next input.
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
  const int npass = 2;
  TrivialPrecon<FineField> simple_fine;
  PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
  FineField noise(FineGrid);
  FineField src(FineGrid);
  FineField guess(FineGrid);
  FineField Mn(FineGrid);

  for(int b=0;b<nn;b++){
    subspace[b] = Zero();
    gaussian(RNG,noise);
    RealD inv_norm = std::pow(norm2(noise),-0.5);
    noise = noise*inv_norm;

    // Diagnostic before filtering
    DiracOp.Op(noise,Mn);
    std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;

    for(int pass=0;pass<npass;pass++){
#if 1
      // Active variant: noise is the source, the guess starts from zero
      std::cout << GridLogMessage << " inverting on noise "<<std::endl;
      src = noise;
      guess=Zero();
      GCR(src,guess);
      subspace[b] = guess;
#else
      // Disabled variant: zero source, noise as the starting guess
      std::cout << GridLogMessage << " inverting on zero "<<std::endl;
      src=Zero();
      guess = noise;
      GCR(src,guess);
      subspace[b] = guess;
#endif
      noise = subspace[b];
      inv_norm = std::pow(norm2(noise),-0.5);
      noise = noise*inv_norm;
    }

    // Diagnostic after filtering
    DiracOp.Op(noise,Mn);
    std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
    subspace[b] = noise;
  }
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
// Two-stage Chebyshev subspace generation from a single Gaussian noise vector.
//
//  Pass 1: apply Chebyshev<FineField>(lo,hi,orderfilter) to the rescaled noise; the rescaled
//          result becomes subspace[0].
//  Pass 2: starting from that vector, run the three-term Chebyshev recurrence on the window
//          [filterlo,hi] and keep the rescaled T_n for
//              n = ordermin, ordermin+orderstep, ..., ordermin+orderstep*(nn-2)
//          as subspace[1..nn-1].
//
//  nn          : number of subspace vectors to produce (checked by GRID_ASSERT at the end)
//  hi, lo      : window for the pass-1 filter
//  orderfilter : order of the pass-1 filter
//  ordermin    : first recurrence order that is harvested
//  orderstep   : spacing between harvested orders (must be non-zero: used as a modulus)
//  filterlo    : lower edge of the window used by the pass-2 recurrence
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                     int nn,
                                     double hi,
                                     double lo,
                                     int orderfilter,
                                     int ordermin,
                                     int orderstep,
                                     double filterlo
                                     ) {

  RealD scale;

  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  // New normalised noise
  gaussian(RNG,noise);
  scale = std::pow(norm2(noise),-0.5);
  noise=noise*scale;

  std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
  std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
            <<ordermin<<" step "<<orderstep
            <<" lo"<<filterlo<<std::endl;

  // Initial matrix element
  hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

  int b =0;   // index of the next subspace slot to fill
  {
    ComplexD ip;
    // Filter
    Chebyshev<FineField> Cheb(lo,hi,orderfilter);
    Cheb(hermop,noise,Mn);
    // normalise
    scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
    subspace[b] = Mn;

    hermop.Op(Mn,tmp);
    ip= innerProduct(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;

    hermop.AdjOp(Mn,tmp);
    ip = innerProduct(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
    b++;
  }

  // Generate a full sequence of Chebyshevs
  {
    lo=filterlo;
    noise=Mn;

    FineField T0(FineGrid); T0 = noise;
    FineField T1(FineGrid);
    FineField T2(FineGrid);
    FineField y(FineGrid);

    // Rotating pointers to T_{n-1}, T_n, T_{n+1}
    FineField *Tnm = &T0;
    FineField *Tn  = &T1;
    FineField *Tnp = &T2;

    // Tn=T1 = (xscale M + mscale)in
    RealD xscale = 2.0/(hi-lo);
    RealD mscale = -(hi+lo)/(hi-lo);
    hermop.HermOp(T0,y);
    T1=y*xscale+noise*mscale;

    for(int n=2;n<=ordermin+orderstep*(nn-2);n++){

      hermop.HermOp(*Tn,y);

      autoView( y_v , y, AcceleratorWrite);
      autoView( Tn_v , (*Tn), AcceleratorWrite);
      autoView( Tnp_v , (*Tnp), AcceleratorWrite);
      autoView( Tnm_v , (*Tnm), AcceleratorWrite);
      const int Nsimd = CComplex::Nsimd();
      accelerator_for(ss, FineGrid->oSites(), Nsimd, {
        coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
        coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
      });

      // Possible more fine grained control is needed than a linear sweep,
      // but huge productivity gain if this is simple algorithm and not a tunable
      //
      // Harvest only once n has reached ordermin, and then every orderstep-th order.
      // The n>=ordermin test is explicit so that orderstep==1 cannot harvest early:
      // the previous "m=1 sentinel" form satisfied m%1==0 for every n<ordermin, which
      // produced more than nn vectors and wrote past the requested subspace.
      if ( (n>=ordermin) && (((n-ordermin)%orderstep)==0) ) {
        Mn=*Tnp;
        scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
        subspace[b] = Mn;

        ComplexD ip;

        hermop.Op(Mn,tmp);
        ip= innerProduct(Mn,tmp);
        std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;

        hermop.AdjOp(Mn,tmp);
        ip = innerProduct(Mn,tmp);
        std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;

        b++;
      }

      // Cycle pointers to avoid copies
      FineField *swizzle = Tnm;
      Tnm =Tn;
      Tn  =Tnp;
      Tnp =swizzle;

    }
  }
  // Exactly nn vectors must have been produced
  GRID_ASSERT(b==nn);
}
// Builds nn subspace vectors as a chain of Chebyshev filters.
//
//  subspace[0] : Chebyshev(lo1,hi,orderfilter) applied to rescaled Gaussian noise, rescaled.
//  subspace[n] : Chebyshev(lo2,hi,orderstep) applied to subspace[n-1], with the components
//                along subspace[0..n-1] subtracted one after another, then rescaled.
//
// Each vector is followed by two diagnostic log lines.
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                     int nn,
                                     double hi,
                                     double lo1,
                                     int orderfilter,
                                     double lo2,
                                     int orderstep)
{
  RealD scale;

  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  // New normalised noise
  gaussian(RNG,noise);
  scale = std::pow(norm2(noise),-0.5);
  noise=noise*scale;

  std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
  // Initial matrix element
  hermop.Op(noise,Mn);
  std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

  int b =0;
  {
    // Filter
    // The logged order is orderfilter, the order this first filter is actually built
    // with (the line used to print orderstep, which belongs to the later filters).
    std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderfilter<<std::endl;
    Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
    Cheb(hermop,noise,Mn);
    // normalise
    scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
    subspace[b] = Mn;
    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
  }

  // Generate a full sequence of Chebyshevs
  for(int n=1;n<nn;n++){
    std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
    Chebyshev<FineField> Cheb(lo2,hi,orderstep);
    Cheb(hermop,subspace[n-1],Mn);

    // Remove the components along the vectors already stored
    for(int m=0;m<n;m++){
      ComplexD c = innerProduct(subspace[m],Mn);
      Mn = Mn - c*subspace[m];
    }

    // normalise
    scale = std::pow(norm2(Mn),-0.5);
    Mn=Mn*scale;

    subspace[n]=Mn;

    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
  }
}
// "Pure noise" variant: every subspace vector starts from its own Gaussian noise vector,
// is filtered with Chebyshev(lo,hi,orderfilter) and is then refined with an order-1000
// Chebyshev built from AggregatePowerLaw on the same window.
// NOTE(review): nn is only written to the log; the loop always fills all nbasis vectors.
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                     int nn,
                                     double hi,
                                     double lo,
                                     int orderfilter
                                     ) {
  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
  std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;

  for(int b=0;b<nbasis;b++) {
    // Fresh rescaled noise for this vector
    gaussian(RNG,noise);
    RealD inv_norm = std::pow(norm2(noise),-0.5);
    noise = noise*inv_norm;

    // Starting matrix element, logged for the first vector only
    hermop.Op(noise,Mn);
    if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

    // Stage 1: band filter
    Chebyshev<FineField> Cheb(lo,hi,orderfilter);
    Cheb(hermop,noise,Mn);
    inv_norm = std::pow(norm2(Mn),-0.5);
    Mn = Mn*inv_norm;

    // Stage 2: power-law refinement
    Chebyshev<FineField> PowerLaw(lo,hi,1000,AggregatePowerLaw);
    noise = Mn;
    PowerLaw(hermop,noise,Mn);
    inv_norm = std::pow(norm2(Mn),-0.5);
    Mn = Mn*inv_norm;

    // Store and report
    subspace[b] = Mn;
    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
  }
}
// Every subspace vector is an independent Gaussian noise vector passed once through a
// Chebyshev of order orderfilter built from AggregatePowerLaw on [0,hi], then rescaled.
// NOTE(review): nn is only written to the log; the loop always fills all nbasis vectors.
virtual void CreateSubspaceChebyshevPowerLaw(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                             int nn,
                                             double hi,
                                             int orderfilter
                                             ) {
  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" [0,"<<hi<<"]"<<std::endl;
  std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;

  for(int b=0;b<nbasis;b++) {
    // Fresh rescaled noise for this vector
    gaussian(RNG,noise);
    RealD inv_norm = std::pow(norm2(noise),-0.5);
    noise = noise*inv_norm;

    // Starting matrix element, logged for the first vector only
    hermop.Op(noise,Mn);
    if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

    // Power-law filter
    Chebyshev<FineField> Cheb(0.0,hi,orderfilter,AggregatePowerLaw);
    Cheb(hermop,noise,Mn);

    // Rescale, store and report
    inv_norm = std::pow(norm2(Mn),-0.5);
    Mn = Mn*inv_norm;
    subspace[b] = Mn;
    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
  }
}
// Every subspace vector is an independent Gaussian noise vector pushed through two
// Chebyshev filters in sequence, rescaling after each:
//   first  Chebyshev(0.02 ,hi, 600)
//   second Chebyshev(0.001,hi,2500)
//
// Tuning history kept from the original source (the leading numbers are the author's
// bookkeeping, apparently iteration counts -- confirm):
//   #opt2(x) = acheb(x,3,90,300)* acheb(x,1,90,50) * acheb(x,0.5,90,200) * acheb(x,0.05,90,400) * acheb(x,0.01,90,1500)
//   266 : Cheb1(3.0,hi,300) Cheb2(1.0,hi,50) Cheb3(0.5,hi,300) Cheb4(0.05,hi,500) Cheb5(0.01,hi,2000)
//   242 : Cheb3(0.1,hi,300) Cheb2(0.02,hi,1000) Cheb1(0.003,hi,2000)   "8?"
//   "How many??" : Cheb2(0.001,hi,1500) Cheb1(0.02,hi,600)
//   169 iters on HDCG after refine : the pair used below
virtual void CreateSubspaceChebyshevNew(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                        double hi
                                        ) {
  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  for(int b=0;b<nbasis;b++) {
    // Fresh rescaled noise for this vector
    gaussian(RNG,noise);
    RealD inv_norm = std::pow(norm2(noise),-0.5);
    noise = noise*inv_norm;

    // Starting matrix element, logged for the first vector only
    hermop.Op(noise,Mn);
    if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

    // Filters (constructed in the same order as before: Cheb2 then Cheb1)
    Chebyshev<FineField> Cheb2(0.001,hi,2500); // 169 iters on HDCG after refine
    Chebyshev<FineField> Cheb1(0.02,hi,600);

    // First filter
    Cheb1(hermop,noise,Mn);
    inv_norm = std::pow(norm2(Mn),-0.5);
    noise = Mn*inv_norm;
    hermop.Op(noise,tmp);
    std::cout<<GridLogMessage << "Cheb1 <n|MdagM|n> "<<norm2(tmp)<<std::endl;

    // Second filter
    Cheb2(hermop,noise,Mn);
    inv_norm = std::pow(norm2(Mn),-0.5);
    noise = Mn*inv_norm;
    hermop.Op(noise,tmp);
    std::cout<<GridLogMessage << "Cheb2 <n|MdagM|n> "<<norm2(tmp)<<std::endl;

    // (Cheb3, Cheb4 and Cheb5 stages of the earlier experiments are not applied.)

    // Store and report
    subspace[b] = noise;
    hermop.Op(subspace[b],tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<< " norm " << norm2(noise)<<std::endl;
  }
}
// Every subspace vector is an independent Gaussian noise vector filtered by a four-pole
// multishift CG, then rescaled.
//
// Filter
//  [ 1/6(x+Lo) - 1/2(x+2Lo) + 1/2(x+3Lo) -1/6(x+4Lo) = Lo^3 /[ (x+1Lo)(x+2Lo)(x+3Lo)(x+4Lo) ]
//
//  1/(x+Lo) - 1/(x+2 Lo)
//
// In the code the poles are spaced by Lo/3 starting at Lo, with residues
// {1/6,-1/2,1/2,-1/6}; all four use the same tolerance tol.
virtual void CreateSubspaceMultishift(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                      double Lo,double tol,int maxit)
{
  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  std::cout << GridLogMessage<<" Multishift subspace : Lo "<<Lo<<std::endl;

  // Pole spacing, residues, pole positions and per-pole tolerances
  double epsilon = Lo/3;
  std::vector<RealD> alpha({1.0/6.0,-1.0/2.0,1.0/2.0,-1.0/6.0});
  std::vector<RealD> shifts({Lo,Lo+epsilon,Lo+2*epsilon,Lo+3*epsilon});
  std::vector<RealD> tols({tol,tol,tol,tol});
  std::cout << "sizes "<<alpha.size()<<" "<<shifts.size()<<" "<<tols.size()<<std::endl;

  // Start from a constructed MultiShiftFunction, then overwrite its tables by hand
  MultiShiftFunction msf(4,0.0,95.0);
  std::cout << "msf constructed "<<std::endl;
  msf.poles      = shifts;
  msf.residues   = alpha;
  msf.tolerances = tols;
  msf.norm       = 0.0;
  msf.order      = alpha.size();

  ConjugateGradientMultiShift<FineField> MSCG(maxit,msf);

  for(int b=0;b<nbasis;b++) {
    // Fresh rescaled noise for this vector
    gaussian(RNG,noise);
    RealD inv_norm = std::pow(norm2(noise),-0.5);
    noise = noise*inv_norm;

    // Starting matrix element, logged for the first vector only
    hermop.Op(noise,Mn);
    if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

    // Multishift filter, rescale, store and report
    MSCG(hermop,noise,Mn);
    inv_norm = std::pow(norm2(Mn),-0.5);
    Mn = Mn*inv_norm;
    subspace[b] = Mn;
    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
  }
}
// Replaces every subspace vector by the rescaled result of a CG solve on hermop with that
// vector as right-hand side and a zero starting guess.
// NOTE(review): the shifted operator built from (hermop,Lo) is constructed but the solve
// is run on the unshifted hermop, so Lo has no effect on the result -- confirm whether the
// shifted operator was meant to be passed to the solver.
virtual void RefineSubspace(LinearOperatorBase<FineField> &hermop,
                            double Lo,double tol,int maxit)
{
  FineField work(FineGrid);
  for(int b=0;b<nbasis;b++) {
    ConjugateGradient<FineField> CGsloppy(tol,maxit,false);
    ShiftedHermOpLinearOperator<FineField> ShiftedFineHermOp(hermop,Lo);

    // Solve from a zero guess
    work = Zero();
    CGsloppy(hermop,subspace[b],work);

    // Rescale and store
    const RealD inv_norm = std::pow(norm2(work),-0.5);
    work = work*inv_norm;
    subspace[b] = work;

    // Report
    hermop.Op(subspace[b],work);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(work)<<std::endl;
  }
}
// Refines the subspace in batches of nrhs vectors with the supplied multi-RHS solver:
// each batch is used as the set of sources, solved from zero guesses, and every result
// is rescaled and written back over the vector it came from.
// NOTE(review): when the last batch holds fewer than nrhs vectors, the remaining source
// slots keep whatever they held before; their results are not copied back.
virtual void RefineSubspaceHDCG(LinearOperatorBase<FineField> &hermop,
                                TwoLevelADEF2mrhs<FineField,CoarseVector> & theHDCG,
                                int nrhs)
{
  std::vector<FineField> src_mrhs(nrhs,FineGrid);
  std::vector<FineField> res_mrhs(nrhs,FineGrid);
  FineField work(FineGrid);

  for(int b=0;b<nbasis;b+=nrhs) {
    // Rescale the first vector of the batch in place and report on it
    work = subspace[b];
    const RealD first_inv_norm = std::pow(norm2(work),-0.5);
    work = work*first_inv_norm;
    subspace[b] = work;
    hermop.Op(subspace[b],work);
    std::cout<<GridLogMessage << "before filt ["<<b<<"] <n|MdagM|n> "<<norm2(work)<<std::endl;

    // Load the sources for this batch and zero every guess
    for(int r=0;r<MIN(nbasis-b,nrhs);r++){
      src_mrhs[r] = subspace[b+r];
    }
    for(int r=0;r<nrhs;r++){
      res_mrhs[r] = Zero();
    }

    theHDCG(src_mrhs,res_mrhs);

    // Rescale the results and write them back
    for(int r=0;r<MIN(nbasis-b,nrhs);r++){
      work = res_mrhs[r];
      const RealD inv_norm = std::pow(norm2(work),-0.5);
      work = work*inv_norm;
      subspace[b+r] = work;
    }

    hermop.Op(subspace[b],work);
    std::cout<<GridLogMessage << "after filt ["<<b<<"] <n|MdagM|n> "<<norm2(work)<<std::endl;
  }
}
};
NAMESPACE_END(Grid);

View File

@@ -1,629 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef Lattice<CComplex > FineComplexField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
int hermitian;
GridBase * _FineGrid;
GridCartesian * _CoarseGrid;
NonLocalStencilGeometry &geom;
PaddedCell Cell;
GeneralLocalStencil Stencil;
std::vector<CoarseMatrix> _A;
std::vector<CoarseMatrix> _Adag;
std::vector<CoarseVector> MultTemporaries;
///////////////////////
// Interface
///////////////////////
// Grid of this operator: the coarse grid, returned as a GridBase pointer.
GridBase * Grid(void)
{
  return _CoarseGrid;
}
// Fine grid supplied at construction.
GridBase * FineGrid(void)
{
  return _FineGrid;
}
// Coarse grid with its concrete GridCartesian type.
GridCartesian * CoarseGrid(void)
{
  return _CoarseGrid;
}
/* void ShiftMatrix(RealD shift)
{
int Nd=_FineGrid->Nd();
Coordinate zero_shift(Nd,0);
for(int p=0;p<geom.npoint;p++){
if ( zero_shift==geom.shifts[p] ) {
_A[p] = _A[p]+shift;
// _Adag[p] = _Adag[p]+shift;
}
}
}
void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
{
int nfound=0;
std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
for(int p=0;p<geom.npoint;p++){
for(int pp=0;pp<CopyMe.geom.npoint;pp++){
// Search for the same relative shift
// Avoids brutal handling of Grid pointers
if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
_A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
// _Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
nfound++;
}
}
}
GRID_ASSERT(nfound==geom.npoint);
ExchangeCoarseLinks();
}
*/
// Binds the stencil geometry and both grids, builds the padded cell (depth _geom.Depth())
// on the coarse grid and the local stencil on the padded cell's last grid, and allocates
// one coarse link matrix per stencil point.  The operator is flagged hermitian.
//
// The mem-initializers are listed in member declaration order, which is the order C++
// runs them in regardless of how they are written; Cell relies on _CoarseGrid and
// Stencil relies on Cell and geom already being initialised.
GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
  : hermitian(1),
    _FineGrid(FineGrid),
    _CoarseGrid(CoarseGrid),
    geom(_geom),
    Cell(_geom.Depth(),_CoarseGrid),
    Stencil(Cell.grids.back(),geom.shifts)
{
  _A.resize(geom.npoint,CoarseGrid);
  //    _Adag.resize(geom.npoint,CoarseGrid);
}
void M (const CoarseVector &in, CoarseVector &out)
{
Mult(_A,in,out);
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
GRID_ASSERT(hermitian);
Mult(_A,in,out);
// if ( hermitian ) M(in,out);
// else Mult(_Adag,in,out);
}
// Applies the coarse stencil operator: out = sum over stencil points p of A[p] * (in at the
// neighbour selected by Stencil entry p).
//
// Steps, in order:
//   1. copy `in` and exchange it into the padded cell (Cell.ExchangePeriodic)
//   2. open views and copy the per-point view tables to the device
//   3. kernel 1: one work item per (site, point, basis row) writes the partial product
//      for that point into MultTemporaries[point]
//   4. kernel 2: one work item per (site, basis row) sums the partial products over points
//   5. extract the interior of the padded result into `out` (Cell.Extract)
//
// All timers accumulate differences of usecond(); results go to GridLogPerformance.
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{
// Timers.  Note tmult spans both kernels, so it already includes tmult2.
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
RealD tmult2=0;
ttot=-usecond();
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
// Work on a copy of the input, then build the padded-cell version of it
CoarseVector tin=in;
texch-=usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin);
texch+=usecond();
// Result on the same (padded) grid as pin
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t osites=pin.Grid()->oSites();
// Performance-model figures used only in the log lines at the end
RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+ 2.0*osites*sizeof(siteVector)*npoint;
{
tviews-=usecond();
autoView( in_v , pin, AcceleratorRead);
autoView( out_v , pout, AcceleratorWriteDiscard);
autoView( Stencil_v , Stencil, AcceleratorRead);
tviews+=usecond();
// Static and prereserve to keep UVM region live and not resized across multiple calls
ttemps-=usecond();
MultTemporaries.resize(npoint,pin.Grid());
ttemps+=usecond();
// Host-side tables of views: one link view and one temporary view per stencil point.
// These are opened by hand here and closed by hand after the kernels.
std::vector<Aview> AcceleratorViewContainer_h;
std::vector<Vview> AcceleratorVecViewContainer_h;
tviews-=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h.push_back( A[p].View(AcceleratorRead));
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
}
tviews+=usecond();
// Device-side copies of the view tables.  Being function-local statics they persist
// across calls and are shared by every object of this class instantiation.
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
auto Aview_p = &AcceleratorViewContainer[0];
auto Vview_p = &AcceleratorVecViewContainer[0];
tcopy-=usecond();
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
tcopy+=usecond();
// Kernel 1: the flat index spb is decoded as (site ss, stencil point, basis row b).
// Each work item reads the permuted neighbour for its point and accumulates
// sum_bb A[point][ss](bb,b) * nbr(bb) into the temporary for that point.
// (The calcComplex typedef inside the kernel is not referenced.)
tmult-=usecond();
accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int32_t ss = spb/(nbasis*npoint);
int32_t bp = spb%(nbasis*npoint);
int32_t point= bp/nbasis;
int32_t b = bp%nbasis;
auto SE = Stencil_v.GetEntry(point,ss);
auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
auto res = coalescedRead(Aview_p[point][ss](0,b))*nbr(0);
for(int bb=1;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](bb,b))*nbr(bb);
}
coalescedWrite(Vview_p[point][ss](b),res);
});
// Kernel 2: sum the per-point temporaries into the padded output, one work item
// per (site ss, basis row b).
tmult2-=usecond();
accelerator_for(sb, osites*nbasis, Nsimd, {
int ss = sb/nbasis;
int b = sb%nbasis;
auto res = coalescedRead(Vview_p[0][ss](b));
for(int point=1;point<npoint;point++){
res = res + coalescedRead(Vview_p[point][ss](b));
}
coalescedWrite(out_v[ss](b),res);
});
tmult2+=usecond();
tmult+=usecond();
// Close the views that were opened by hand above
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h[p].ViewClose();
AcceleratorVecViewContainer_h[p].ViewClose();
}
}
// Copy the interior of the padded result back to the caller's field
text-=usecond();
out = Cell.Extract(pout);
text+=usecond();
ttot+=usecond();
// Performance report.  The rate lines divide by timer values taken from usecond()
// (presumably microseconds, which is what makes the "mflop/s" and "MB/s" labels
// consistent -- confirm).
std::cout << GridLogPerformance<<"Coarse 1rhs Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogPerformance<<" of which mult2 "<<tmult2<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
// std::cout << GridLogPerformance<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel flops "<< flops<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Kernel bytes/s "<< bytes/tmult<<" MB/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
void PopulateAdag(void)
{
for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
Coordinate bcoor;
CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
for(int p=0;p<geom.npoint;p++){
Coordinate scoor = bcoor;
for(int mu=0;mu<bcoor.size();mu++){
int L = CoarseGrid()->GlobalDimensions()[mu];
scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
}
// Flip to poke/peekLocalSite and not too bad
auto link = peekSite(_A[p],scoor);
int pp = geom.Reverse(p);
pokeSite(adj(link),_Adag[pp],bcoor);
}
}
}
/////////////////////////////////////////////////////////////
//
// A) Only reduced flops option is to use a padded cell of depth 4
// and apply MpcDagMpc in the padded cell.
//
// Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
// With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
// Cost is 81x more, same as stencil size.
//
// But: can eliminate comms and do as local dirichlet.
//
// Local exchange gauge field once.
// Apply to all vectors, local only computation.
// Must exchange ghost subcells in reverse process of PaddedCell to take inner products
//
// B) Can reduce cost: pad by 1, apply Deo (4^4+6^4+8^4+8^4 )/ (4x 4^4)
// pad by 2, apply Doe
// pad by 3, apply Deo
// then break out 8x directions; cost is ~10x MpcDagMpc per vector
//
// => almost factor of 10 in setup cost, excluding data rearrangement
//
// Intermediates -- ignore the corner terms, leave approximate and force Hermitian
// Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
/////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////
// BFM HDCG style approach: Solve a system of equations to get Aij
//////////////////////////////////////////////////////////
/*
* Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
*
* conj(phases[block]) proj[k][ block*Nvec+j ] = \sum_ball e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} >
* = \sum_ball e^{iqk.delta} A_ji
*
* Must invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*/
#if 0
// Single-subspace coarsening of `linop` into the per-point coarse links _A[].
// This definition lives in the `#if 0` branch of the surrounding conditional,
// so it is not compiled; the `#else` branch provides the live versions.
//
// Outline of what the body does:
//   1. block-orthogonalise Subspace.subspace,
//   2. build the npoint x npoint phase matrix Mkl and invert it with Eigen,
//   3. for each basis vector i and each momentum p: multiply the basis vector
//      by a per-block phase, apply linop.Op, block-project onto the subspace
//      and strip the phase again,
//   4. combine the npoint projections with invMkl and store the result in
//      entry (i,j) of _A[k] at every coarse outer site,
//   5. call ExchangeCoarseLinks() and print the accumulated timers.
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
// Wall-clock accumulators, filled from usecond() and reported at the end
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
// Accumulate q_k . delta_l = sum_mu (2 pi / L_mu) shift_k[mu] shift_l[mu]
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
CoarseVector coarseInner(CoarseGrid());
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
tphase-=usecond();
CoarseComplexField coor(CoarseGrid());
CoarseComplexField pha(CoarseGrid()); pha=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha =exp(pha*ci);
// phaV = pha (per block) * subspace[i]; the additive term phaV was just zeroed
phaV=Zero();
blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
tphase+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiply phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
// FT[k] = sum_l invMkl(l,k) * ComputeProj[l]
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
// Scatter component j of FT[k] into entry (i,j) of the link matrix at each outer site
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
// (the call is commented out, so this branch currently does nothing)
if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
}
// Need to write something to populate Adag from A
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
// Galerkin case: the same aggregation is used on both sides, so simply
// forward to the two-subspace overload with Subspace passed twice.
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
                     Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
  this->CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
// Two-subspace coarsening of `linop` into the per-point coarse links _A[].
//   linop : fine-grid operator; only linop.Op() is used here.
//   U     : aggregation whose subspace is projected onto (blockProject).
//   V     : aggregation whose subspace vectors are phased and fed to linop.Op().
//
// Outline of what the body does:
//   1. block-orthogonalise V.subspace and then U.subspace,
//   2. build the npoint x npoint phase matrix Mkl and invert it with Eigen,
//   3. precompute, once per momentum p, the coarse phase field pha[p] and its
//      promotion to the fine grid phaF[p],
//   4. for each basis vector i and momentum p: phaV = phaF[p]*V.subspace[i],
//      apply linop.Op, project onto U.subspace, strip the phase,
//   5. combine the npoint projections with invMkl and store the result in
//      entry (i,j) of _A[k] at every coarse outer site,
//   6. print norm2 of every _A[p], call ExchangeCoarseLinks(), print timers.
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
// Wall-clock accumulators, filled from usecond() and reported at the end
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
// NOTE(review): when called through the single-subspace overload U and V are
// the same object, so the same subspace is orthogonalised twice here.
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
// Accumulate q_k . delta_l = sum_mu (2 pi / L_mu) shift_k[mu] shift_l[mu]
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid());
CoarseVector coarseInner(CoarseGrid());
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
// tphase is still 0.0 here, so plain assignment starts the timer
tphase=-usecond();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid());
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
// Promote the coarse phase to the fine grid: phaF[p] = pha[p]*one + zz
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiply phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
// std::cout << i << " " <<p << " ComputeProj "<<norm2(ComputeProj[p])<<std::endl;
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
// FT[k] = sum_l invMkl(l,k) * ComputeProj[l]
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
// Scatter component j of FT[k] into entry (i,j) of the link matrix at each outer site
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
// (the call is commented out, so this branch currently does nothing)
if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
}
// Diagnostic: norm of every coarse link field (written straight to std::cout,
// without the GridLogMessage prefix used by the other lines)
for(int p=0;p<geom.npoint;p++){
std::cout << " _A["<<p<<"] "<<norm2(_A[p])<<std::endl;
}
// Need to write something to populate Adag from A
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#endif
// Replace every coarse link field _A[p] by the result of
// Cell.ExchangePeriodic applied to it. The matching _Adag exchange is
// currently disabled.
void ExchangeCoarseLinks(void){
  int point = 0;
  while ( point < geom.npoint ) {
    auto &link = _A[point];
    link = Cell.ExchangePeriodic(link);
    // _Adag[point]= Cell.ExchangePeriodic(_Adag[point]);
    ++point;
  }
}
// Diagonal / directional applications are not implemented for this operator.
// All three entry points fail through the project's GRID_ASSERT macro so that
// they behave alike; a plain assert(0) is compiled out when NDEBUG is defined
// and would turn the call into a silent no-op that leaves `out` untouched.
virtual void Mdiag    (const Field &in, Field &out){ GRID_ASSERT(0);};
virtual void Mdir     (const Field &in, Field &out,int dir, int disp){ GRID_ASSERT(0);};
virtual void MdirAll  (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0);};
};
NAMESPACE_END(Grid);

View File

@@ -1,729 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrixMultiRHS.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class MultiGeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef typename CComplex::scalar_object SComplex;
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef MultiGeneralCoarsenedMatrix<Fobj,CComplex,nbasis> MultiGeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef iVector<SComplex,nbasis > calcVector;
typedef iMatrix<SComplex,nbasis > calcMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef Lattice<CComplex > FineComplexField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
// C++ constructs non-static members in the order they are declared here,
// independent of the order written in a constructor's initialiser list.
GridCartesian * _CoarseGridMulti; // grid returned by Grid() / CoarseGrid()
NonLocalStencilGeometry geom;      // built from (_CoarseGridMulti, hops, skip+1) of the geometry passed in
NonLocalStencilGeometry geom_srhs; // copy of the geometry passed to the constructor
PaddedCell Cell;                   // padded cell of depth geom.Depth() on _CoarseGridMulti
GeneralLocalStencil Stencil;       // stencil over geom.shifts on the padded grid
// Flat device buffers used by the constructor's pointer tables
deviceVector<calcVector> BLAS_B;   // sized nrhs * padded sites (includes ghost zone)
deviceVector<calcVector> BLAS_C;   // sized nrhs * unpadded sites (no ghost zone)
std::vector<deviceVector<calcMatrix> > BLAS_A;  // [point][site] link matrices, unpadded sites
// Pointer tables filled by the constructor
std::vector<deviceVector<ComplexD *> > BLAS_AP; // [point][site] -> &BLAS_A[point][site]
std::vector<deviceVector<ComplexD *> > BLAS_BP; // [point][interior site] -> neighbour entry in BLAS_B
deviceVector<ComplexD *> BLAS_CP;               // [site] -> &BLAS_C[site*nrhs]
///////////////////////
// Interface
///////////////////////
// Grid handed to the linear-algebra routines: the multi-RHS coarse grid,
// returned through the GridBase interface.
GridBase * Grid(void)
{
  return _CoarseGridMulti;
}
// Same grid as Grid(), but with its concrete GridCartesian type.
GridCartesian * CoarseGrid(void)
{
  return _CoarseGridMulti;
}
// Can be used to do I/O on the operator matrices externally
// Import the link field for stencil point `p` from a Grid lattice into the
// flat BLAS_A[p] buffer.
//   p : stencil point index, must address an existing BLAS_A entry.
//   A : the coarse link lattice for that single point. GridtoBLAS takes a
//       whole Lattice, so A is passed as-is (it is not a per-point container
//       and must not be indexed by p).
// NOTE(review): GridtoBLAS resizes BLAS_A[p] to A's local volume; presumably
// A should live on the single-RHS coarse grid, as in CopyMatrix - confirm.
void SetMatrix (int p,CoarseMatrix & A)
{
  GRID_ASSERT(p>=0 && p<static_cast<int>(BLAS_A.size()));
  GridtoBLAS(A,BLAS_A[p]);
}
// Export the flat BLAS_A[p] buffer for stencil point `p` back into a Grid
// lattice.
//   p : stencil point index, must address an existing BLAS_A entry.
//   A : destination lattice for that single point. BLAStoGrid takes a whole
//       Lattice and itself asserts that BLAS_A[p].size() equals A's local
//       site count, so A is passed as-is rather than indexed by p.
void GetMatrix (int p,CoarseMatrix & A)
{
  GRID_ASSERT(p>=0 && p<static_cast<int>(BLAS_A.size()));
  BLAStoGrid(A,BLAS_A[p]);
}
// Take the link fields of an existing single-RHS coarse operator and load
// them into this object's BLAS_A buffers, one stencil point at a time.
void CopyMatrix (GeneralCoarseOp &_Op)
{
  for(int point=0; point<geom.npoint; ++point){
    // _Op stores its links on the padded cell; Extract gives the unpadded field
    auto unpadded = _Op.Cell.Extract(_Op._A[point]);
    GridtoBLAS(unpadded,BLAS_A[point]);
  }
}
/*
void CheckMatrix (GeneralCoarseOp &_Op)
{
std::cout <<"************* Checking the little direc operator mRHS"<<std::endl;
for(int p=0;p<geom.npoint;p++){
//Unpadded
auto Aup = _Op.Cell.Extract(_Op._A[p]);
auto Ack = Aup;
BLAStoGrid(Ack,BLAS_A[p]);
std::cout << p<<" Ack "<<norm2(Ack)<<std::endl;
std::cout << p<<" Aup "<<norm2(Aup)<<std::endl;
}
std::cout <<"************* "<<std::endl;
}
*/
// Construct the multi-RHS coarse operator storage.
//   _geom           : stencil geometry; copied into geom_srhs, and its
//                     hops / skip+1 are used to build `geom` on the multi-RHS grid.
//   CoarseGridMulti : coarse grid whose dimension 0 counts the right-hand sides.
//
// The initialiser list is written in member declaration order
// (_CoarseGridMulti, geom, geom_srhs, Cell, Stencil), which is the order the
// members are actually constructed in; Cell and Stencil depend on geom.
//
// The body sizes the flat device buffers and fills three pointer tables:
//   BLAS_AP[p][ss] -> BLAS_A[p][ss]
//   BLAS_CP[ss]    -> BLAS_C[ss*nrhs]
//   BLAS_BP[p][j]  -> neighbour of interior site j for point p, inside BLAS_B
MultiGeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridCartesian *CoarseGridMulti) :
  _CoarseGridMulti(CoarseGridMulti),
  geom(_CoarseGridMulti,_geom.hops,_geom.skip+1),
  geom_srhs(_geom),
  Cell(geom.Depth(),_CoarseGridMulti),
  Stencil(Cell.grids.back(),geom.shifts) // padded cell stencil
{
  int32_t padded_sites   = Cell.grids.back()->lSites();
  int32_t unpadded_sites = CoarseGridMulti->lSites();

  int32_t nrhs = CoarseGridMulti->FullDimensions()[0];  // # RHS
  int32_t orhs = nrhs/CComplex::Nsimd();

  // Site counts per right-hand side
  padded_sites   = padded_sites/nrhs;
  unpadded_sites = unpadded_sites/nrhs;

  /////////////////////////////////////////////////
  // Device data vector storage
  /////////////////////////////////////////////////
  BLAS_A.resize(geom.npoint);
  for(int p=0;p<geom.npoint;p++){
    BLAS_A[p].resize (unpadded_sites); // no ghost zone, npoint elements
  }
  BLAS_B.resize(nrhs *padded_sites);   // includes ghost zone
  BLAS_C.resize(nrhs *unpadded_sites); // no ghost zone
  BLAS_AP.resize(geom.npoint);
  BLAS_BP.resize(geom.npoint);
  for(int p=0;p<geom.npoint;p++){
    BLAS_AP[p].resize(unpadded_sites);
    BLAS_BP[p].resize(unpadded_sites);
  }
  BLAS_CP.resize(unpadded_sites);

  /////////////////////////////////////////////////
  // Pointers to data
  /////////////////////////////////////////////////

  // Site identity mapping for A
  for(int p=0;p<geom.npoint;p++){
    for(int ss=0;ss<unpadded_sites;ss++){
      ComplexD *ptr = (ComplexD *)&BLAS_A[p][ss];
      acceleratorPut(BLAS_AP[p][ss],ptr);
    }
  }
  // Site identity mapping for C
  for(int ss=0;ss<unpadded_sites;ss++){
    ComplexD *ptr = (ComplexD *)&BLAS_C[ss*nrhs];
    acceleratorPut(BLAS_CP[ss],ptr);
  }

  // Neighbour table is more complicated
  int32_t j=0; // Interior point counter (unpadded)
  for(int32_t s=0;s<padded_sites;s++){ // 4 volume, padded
    // A padded site is interior only if none of its stencil legs wrapped
    int ghost_zone=0;
    for(int32_t point = 0 ; point < geom.npoint; point++){
      int i=s*orhs*geom.npoint+point;
      if( Stencil._entries[i]._wrap ) { // stencil is indexed by the oSite of the CoarseGridMulti, hence orhs factor
        ghost_zone=1; // If general stencil wrapped in any direction, wrap=1
      }
    }

    if( ghost_zone==0) {
      for(int32_t point = 0 ; point < geom.npoint; point++){
        int i=s*orhs*geom.npoint+point;
        int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
        GRID_ASSERT(nbr<BLAS_B.size());
        ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
        acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
      }
      j++;
    }
  }
  // Every unpadded site must have been visited exactly once
  GRID_ASSERT(j==unpadded_sites);
}
template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *Fg = from.Grid();
GRID_ASSERT(!Fg->_isCheckerBoarded);
int nd = Fg->_ndimension;
to.resize(Fg->lSites());
Coordinate LocalLatt = Fg->LocalDimensions();
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= LocalLatt[i];
////////////////////////////////////////////////////////////////////////////////////////////////
// do the index calc on the GPU
////////////////////////////////////////////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
autoView(from_v,from,AcceleratorRead);
auto to_v = &to[0];
const int words=sizeof(vobj)/sizeof(vector_type);
accelerator_for(idx,nsite,1,{
Coordinate from_coor, base;
Lexicographic::CoorFromIndex(base,idx,LocalLatt);
for(int i=0;i<nd;i++){
from_coor[i] = base[i];
}
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx];
scalar_type* to = (scalar_type *)&to_v[idx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
to[w] = stmp;
}
});
}
// Pack a flat device array of scalar objects (one per local site, in
// lexicographic order) back into a (non-checkerboarded) Grid lattice.
//   grid : destination lattice.
//   in   : source buffer; its size must equal the grid's local site count.
template<class vobj> void BLAStoGrid(Lattice<vobj> &grid,deviceVector<typename vobj::scalar_object> &in)
{
  using scalar_type = typename vobj::scalar_type;
  using vector_type = typename vobj::vector_type;

  GridBase *dst_grid = grid.Grid();
  GRID_ASSERT(!dst_grid->_isCheckerBoarded);
  int ndim = dst_grid->_ndimension;

  GRID_ASSERT(in.size()==dst_grid->lSites());

  Coordinate local_dims = dst_grid->LocalDimensions();
  size_t nsite = 1;
  for(int d=0;d<ndim;d++) nsite *= local_dims[d];

  ////////////////////////////////////////////////////////////////////////////////////////////////
  // do the index calc on the GPU
  ////////////////////////////////////////////////////////////////////////////////////////////////
  Coordinate dst_ostride = dst_grid->_ostride;
  Coordinate dst_istride = dst_grid->_istride;
  Coordinate dst_rdims   = dst_grid->_rdimensions;

  autoView(dst_view,grid,AcceleratorWrite);
  auto src_base = &in[0];

  // Number of SIMD words making up one site object
  const int words=sizeof(vobj)/sizeof(vector_type);

  accelerator_for(idx,nsite,1,{

    // Local lexicographic index -> local coordinate
    Coordinate site_coor;
    Lexicographic::CoorFromIndex(site_coor,idx,local_dims);

    // Local coordinate -> (outer site, SIMD lane) on the destination grid
    int dst_osite = 0;
    int dst_lane  = 0;
    for(int d=0;d<ndim;d++){
      dst_osite += dst_ostride[d]*(site_coor[d]%dst_rdims[d]);
      dst_lane  += dst_istride[d]*(site_coor[d]/dst_rdims[d]);
    }

    vector_type* dst_words = (vector_type *)&dst_view[dst_osite];
    scalar_type* src_words = (scalar_type *)&src_base[idx];

    // Insert this site's scalars into its lane of every SIMD word
    for(int w=0;w<words;w++){
      scalar_type lane_value = src_words[w];
      putlane(dst_words[w], lane_value, dst_lane);
    }
  });
}
// Compute the coarse link matrices for `linop` on the single-RHS coarse grid
// and load them into the flat BLAS_A[p] buffers.
//   linop      : fine-grid operator; only linop.Op() is used here.
//   Subspace   : aggregation supplying the fine grid and the basis vectors;
//                its subspace is block-orthogonalised in place.
//   CoarseGrid : coarse grid on which the temporary link fields are built.
//                (This parameter hides the member function CoarseGrid()
//                inside the body.)
//
// The body has two alternative implementations selected by the preprocessor:
// the `#if 0` branch is a reference version using one blockProject per
// vector, the `#else` branch is the live version that projects in batches
// through MultiRHSBlockProject.
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace,
GridBase *CoarseGrid)
{
#if 0
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
blockZAXPY(phaF[p],pha[p],one,zz);
}
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
phaV = phaF[p]*Subspace.subspace[i];
/////////////////////////////////////////////////////////////////////
// Multiply phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
linop.Op(phaV,MphaV);
// Fixme, could use batched block projector here
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
for(int k=0;k<npoint;k++){
FT = Zero();
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
/*
Grid : Message : 11698.730546 s : CoarsenOperator eigen 1334 us
Grid : Message : 11698.730563 s : CoarsenOperator phase 34729 us
Grid : Message : 11698.730565 s : CoarsenOperator phaseBZ 2423814 us
Grid : Message : 11698.730566 s : CoarsenOperator mat 127890998 us
Grid : Message : 11698.730567 s : CoarsenOperator proj 515840840 us
Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us
Takes 600s to compute matrix elements, DOMINATED by the block project.
Easy to speed up with the batched block project.
Store npoint vectors, get npoint x Nbasis block projection, and 81 fold faster.
// Block project below takes to 240s
Grid : Message : 328.193418 s : CoarsenOperator phase 38338 us
Grid : Message : 328.193434 s : CoarsenOperator phaseBZ 1711226 us
Grid : Message : 328.193436 s : CoarsenOperator mat 122213270 us
//Grid : Message : 328.193438 s : CoarsenOperator proj 1181154 us <-- this is mistimed
//Grid : Message : 11698.730568 s : CoarsenOperator inv 103948313 us <-- Cut this ~10x if lucky by loop fusion
*/
#else
// Wall-clock accumulators, filled from usecond() and reported at the end
RealD tproj=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tphaseBZ=0.0;
RealD tinv=0.0;
std::cout << GridLogMessage<< "GeneralCoarsenMatrixMrhs "<< std::endl;
GridBase *grid = Subspace.FineGrid;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid);
blockOrthogonalise(InnerProd,Subspace.subspace);
// Batched projector, loaded once with the orthogonalised basis
MultiRHSBlockProject<Lattice<Fobj> > Projector;
Projector.Allocate(nbasis,grid,CoarseGrid);
Projector.ImportBasis(Subspace.subspace);
const int npoint = geom_srhs.npoint;
Coordinate clatt = CoarseGrid->GlobalDimensions();
int Nd = CoarseGrid->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
// Accumulate q_k . delta_l = sum_mu (2 pi / L_mu) shift_k[mu] shift_l[mu]
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom_srhs.shifts[k][mu]*geom_srhs.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
std::vector<FineComplexField> phaF(npoint,grid);
std::vector<CoarseComplexField> pha(npoint,CoarseGrid);
CoarseVector coarseInner(CoarseGrid);
// tphase is still 0.0 here, so plain assignment starts the timer
tphase=-usecond();
typedef typename CComplex::scalar_type SComplex;
FineComplexField one(grid); one=SComplex(1.0);
FineComplexField zz(grid); zz = Zero();
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
CoarseComplexField coor(CoarseGrid);
pha[p]=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha[p] = pha[p] + (TwoPiL * geom_srhs.shifts[p][mu]) * coor;
}
pha[p] =exp(pha[p]*ci);
// Promote the coarse phase to the fine grid: phaF[p] = pha[p]*one + zz
blockZAXPY(phaF[p],pha[p],one,zz);
}
tphase+=usecond();
// Could save on temporary storage here
std::vector<CoarseMatrix> _A;
_A.resize(geom_srhs.npoint,CoarseGrid);
// Could use smaller chunks than npoint == 81 and save memory
int batch = 9;
std::vector<FineField> _MphaV(batch,grid);
std::vector<CoarseVector> TmpProj(batch,CoarseGrid);
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid);
CoarseVector FT(CoarseGrid);
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
// std::cout << GridLogMessage << " phasing the fine vector "<<std::endl;
// Fixme : do this in batches
for(int p=0;p<npoint;p+=batch){ // Loop over momenta in npoint
// Fill up to `batch` operator-applied vectors for momenta p .. p+batch-1
for(int b=0;b<MIN(batch,npoint-p);b++){
tphaseBZ-=usecond();
phaV = phaF[p+b]*Subspace.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiply phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
// Memory footprint was an issue
tmat-=usecond();
linop.Op(phaV,MphaV);
_MphaV[b] = MphaV;
tmat+=usecond();
}
// std::cout << GridLogMessage << " Calling block project "<<std::endl;
// NOTE(review): the whole _MphaV / TmpProj vectors are passed even when the
// last batch is partial; only the first MIN(batch,npoint-p) results are read
// below, so any remaining entries are presumably projected and ignored.
tproj-=usecond();
Projector.blockProject(_MphaV,TmpProj);
tproj+=usecond();
// std::cout << GridLogMessage << " conj phasing the coarse vectors "<<std::endl;
for(int b=0;b<MIN(batch,npoint-p);b++){
ComputeProj[p+b] = conjugate(pha[p+b])*TmpProj[b];
}
}
// Could do this with a block promote or similar BLAS call via the MultiRHSBlockProjector with a const matrix.
// std::cout << GridLogMessage << " Starting FT inv "<<std::endl;
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT = Zero();
// 81 kernel calls as many ComputeProj vectors
// Could fuse with a vector of views, but ugly
// Could unroll the expression and run fewer kernels -- much more attractive
// Could also do non blocking.
#if 0
for(int l=0;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#else
// Same sum as the disabled loop above, taken `radix` terms per expression
const int radix = 9;
int ll;
for(ll=0;ll+radix-1<npoint;ll+=radix){
// When ll = npoint-radix, ll+radix-1 = npoint-1, and we do it all.
FT = FT
+ invMkl(ll+0,k)*ComputeProj[ll+0]
+ invMkl(ll+1,k)*ComputeProj[ll+1]
+ invMkl(ll+2,k)*ComputeProj[ll+2]
+ invMkl(ll+3,k)*ComputeProj[ll+3]
+ invMkl(ll+4,k)*ComputeProj[ll+4]
+ invMkl(ll+5,k)*ComputeProj[ll+5]
+ invMkl(ll+6,k)*ComputeProj[ll+6]
+ invMkl(ll+7,k)*ComputeProj[ll+7]
+ invMkl(ll+8,k)*ComputeProj[ll+8];
}
// Remainder when npoint is not a multiple of radix
for(int l=ll;l<npoint;l++){
FT= FT+ invMkl(l,k)*ComputeProj[l];
}
#endif
// 1 kernel call -- must be cheaper
int osites=CoarseGrid->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT, AcceleratorRead);
// Scatter component j of FT into entry (i,j) of the link matrix at each outer site
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](i,j) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
// Only needed if nonhermitian
// if ( ! hermitian ) {
// std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
// PopulateAdag();
// }
// Need to write something to populate Adag from A
// std::cout << GridLogMessage << " Calling GridtoBLAS "<<std::endl;
// Move the temporary link lattices into the flat per-point BLAS buffers
for(int p=0;p<geom_srhs.npoint;p++){
GridtoBLAS(_A[p],BLAS_A[p]);
}
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phaseBZ "<<tphaseBZ<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
#endif
}
void Mdag(const CoarseVector &in, CoarseVector &out)
{
this->M(in,out);
}
void M (const CoarseVector &in, CoarseVector &out)
{
// std::cout << GridLogMessage << "New Mrhs coarse"<<std::endl;
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
RealD t_tot;
RealD t_exch;
RealD t_GtoB;
RealD t_BtoG;
RealD t_mult;
t_tot=-usecond();
CoarseVector tin=in;
t_exch=-usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin); //padded input
t_exch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef calcMatrix* Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t nrhs =pin.Grid()->GlobalDimensions()[0];
GRID_ASSERT(nrhs>=1);
RealD flops,bytes;
int64_t osites=in.Grid()->oSites(); // unpadded
int64_t unpadded_vol = CoarseGrid()->lSites()/nrhs;
flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
bytes = 1.0*osites*sizeof(siteMatrix)*npoint/pin.Grid()->GlobalDimensions()[0]
+ 2.0*osites*sizeof(siteVector)*npoint;
t_GtoB=-usecond();
GridtoBLAS(pin,BLAS_B);
t_GtoB+=usecond();
GridBLAS BLAS;
t_mult=-usecond();
for(int p=0;p<geom.npoint;p++){
RealD c = 1.0;
if (p==0) c = 0.0;
ComplexD beta(c);
BLAS.gemmBatched(nbasis,nrhs,nbasis,
ComplexD(1.0),
BLAS_AP[p],
BLAS_BP[p],
ComplexD(c),
BLAS_CP);
}
BLAS.synchronise();
t_mult+=usecond();
t_BtoG=-usecond();
BLAStoGrid(out,BLAS_C);
t_BtoG+=usecond();
t_tot+=usecond();
/*
std::cout << GridLogMessage << "New Mrhs coarse DONE "<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult exch "<<t_exch<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult mult "<<t_mult<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult GtoB "<<t_GtoB<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult BtoG "<<t_BtoG<<" us"<<std::endl;
std::cout << GridLogMessage<<"Coarse Mult tot "<<t_tot<<" us"<<std::endl;
*/
// std::cout << GridLogMessage<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flops "<< flops<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel flop/s "<< flops/t_mult<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse Kernel bytes/s "<< bytes/t_mult/1000<<" GB/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
// std::cout << GridLogMessage<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
// Diagonal-term application is not provided by this operator:
// the body is an unconditional GRID_ASSERT(0).
virtual void Mdiag (const Field &in, Field &out)
{
  GRID_ASSERT(0);
}
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);

View File

@@ -1,238 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////
// Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
// Nearest-neighbour stencil description.
// For nd stencil directions there are npoint = 2*nd+1 points: the +1 hops
// (points 0..nd-1), the -1 hops (points nd..2*nd-1) and a final self point
// (point 2*nd) with direction 0 and displacement 0.
class Geometry {
public:
  int npoint;                      // number of stencil points, 2*nd+1
  int base;                        // 1 when constructed with _d==5, otherwise 0; added to every hop direction
  std::vector<int> directions ;    // directions[p]    : direction of point p
  std::vector<int> displacements;  // displacements[p] : +1, -1, or 0 for the self point
  std::vector<int> points_dagger;  // points_dagger[p] : point with the opposite displacement in the same direction

  // _d is the dimensionality of the underlying grid; _d==5 is reduced to a
  // four direction stencil whose directions are offset by base=1.
  Geometry(int _d)  {

    base = (_d==5) ? 1:0;

    // make coarse grid stencil for 4d , not 5d
    if ( _d==5 ) _d=4;

    npoint = 2*_d+1;
    directions.resize(npoint);
    displacements.resize(npoint);
    points_dagger.resize(npoint);
    for(int d=0;d<_d;d++){
      directions[d   ] = d+base;
      directions[d+_d] = d+base;
      displacements[d   ] = +1;
      displacements[d+_d]= -1;
      points_dagger[d   ] = d+_d;
      points_dagger[d+_d] = d;
    }
    directions   [2*_d]=0;
    displacements[2*_d]=0;
    points_dagger[2*_d]=2*_d;
  }

  // Map a (direction, displacement) pair onto its stencil point index.
  //   disp must be -1, 0 or +1.
  //   (dir==0, disp==0) is the self point and returns npoint-1.
  //   Otherwise dir must lie in [base, base+nd).
  // NOTE(review): disp==0 with dir!=0 is not rejected; the integer division
  // below maps it onto the +1 point of that direction -- confirm no caller relies on it.
  int point(int dir, int disp) {
    GRID_ASSERT(disp == -1 || disp == 0 || disp == 1);

    // Number of hop directions, recovered from npoint = 2*nd+1
    const int nd = (npoint-1)/2;

    // Test for the self point before range checking the direction: it is
    // stored with direction 0, which lies outside [base,base+nd) when base==1,
    // so checking the range first would reject the documented 5d self point.
    if(dir == 0 and disp == 0)
      return 2*nd;

    GRID_ASSERT(base+0 <= dir && dir < base+nd);

    // directions faster index = new indexing
    // 4d (base = 0):
    // point 0  1  2  3  4  5  6  7  8
    // dir   0  1  2  3  0  1  2  3  0
    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
    // 5d (base = 1):
    // point 0  1  2  3  4  5  6  7  8
    // dir   1  2  3  4  1  2  3  4  0
    // disp +1 +1 +1 +1 -1 -1 -1 -1  0

    // displacements faster index = old indexing
    // 4d (base = 0):
    // point 0  1  2  3  4  5  6  7  8
    // dir   0  0  1  1  2  2  3  3  0
    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
    // 5d (base = 1):
    // point 0  1  2  3  4  5  6  7  8
    // dir   1  1  2  2  3  3  4  4  0
    // disp +1 -1 +1 -1 +1 -1 +1 -1  0

    // New indexing: (1-disp)/2 is 0 for disp=+1 and 1 for disp=-1
    return (1 - disp) / 2 * nd + dir - base;
    // Old indexing
    //   return (4 * (dir - base) + 1 - disp) / 2;
  }
};
/////////////////////////////////////////////////////////////////
// Less local equivalent of Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class NonLocalStencilGeometry {
public:
// int depth;
int skip;
int hops;
int npoint;
std::vector<Coordinate> shifts;
Coordinate stencil_size;
Coordinate stencil_lo;
Coordinate stencil_hi;
GridCartesian *grid;
GridCartesian *Grid() {return grid;};
int Depth(void){return 1;}; // Ghost zone depth
int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
int DimSkip(void){return skip;};
virtual ~NonLocalStencilGeometry() {};
int Reverse(int point)
{
int Nd = Grid()->Nd();
Coordinate shft = shifts[point];
Coordinate rev(Nd);
for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
for(int p=0;p<npoint;p++){
if(rev==shifts[p]){
return p;
}
}
GRID_ASSERT(0);
return -1;
}
void BuildShifts(void)
{
this->shifts.resize(0);
int Nd = this->grid->Nd();
int dd = this->DimSkip();
for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
Coordinate sft(Nd,0);
sft[dd+0] = s0;
sft[dd+1] = s1;
sft[dd+2] = s2;
sft[dd+3] = s3;
int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
if(nhops<=this->hops) this->shifts.push_back(sft);
}}}}
this->npoint = this->shifts.size();
std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
}
NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops,int _skip) : grid(_coarse_grid), hops(_hops), skip(_skip)
{
Coordinate latt = grid->GlobalDimensions();
stencil_size.resize(grid->Nd());
stencil_lo.resize(grid->Nd());
stencil_hi.resize(grid->Nd());
for(int d=0;d<grid->Nd();d++){
if ( latt[d] == 1 ) {
stencil_lo[d] = 0;
stencil_hi[d] = 0;
stencil_size[d]= 1;
} else if ( latt[d] == 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 0;
stencil_size[d]= 2;
} else if ( latt[d] > 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 1;
stencil_size[d]= 3;
}
}
this->BuildShifts();
};
};
// Need to worry about red-black now
// Four dimensional variant: builds the base geometry with a dimension skip of 0.
class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
public:
  NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops)
    : NonLocalStencilGeometry(Coarse,_hops,0)
  {
  }
  virtual ~NonLocalStencilGeometry4D() {}
  virtual int DerivedDimSkip(void) { return 0; }
};
// Five dimensional variant: builds the base geometry with a dimension skip of 1.
class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
public:
  NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops)
    : NonLocalStencilGeometry(Coarse,_hops,1)
  {
  }
  virtual ~NonLocalStencilGeometry5D() {}
  virtual int DerivedDimSkip(void) { return 1; }
};
/*
* Bunch of different options classes
*/
// 4d stencil built with a hop limit of 4.
class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
  NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse)
    : NonLocalStencilGeometry4D(Coarse,4)
  {}
};
// 5d stencil built with a hop limit of 4.
class NextToNextToNextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
  NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse)
    : NonLocalStencilGeometry5D(Coarse,4)
  {}
};
// 4d stencil built with a hop limit of 2.
class NextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
  NextToNearestStencilGeometry4D(GridCartesian *Coarse)
    : NonLocalStencilGeometry4D(Coarse,2)
  {}
};
// 5d stencil built with a hop limit of 2.
class NextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
  NextToNearestStencilGeometry5D(GridCartesian *Coarse)
    : NonLocalStencilGeometry5D(Coarse,2)
  {}
};
// 4d stencil built with a hop limit of 1.
class NearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
  NearestStencilGeometry4D(GridCartesian *Coarse)
    : NonLocalStencilGeometry4D(Coarse,1)
  {}
};
// 5d stencil built with a hop limit of 1.
class NearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
  NearestStencilGeometry5D(GridCartesian *Coarse)
    : NonLocalStencilGeometry5D(Coarse,1)
  {}
};
NAMESPACE_END(Grid);

View File

@@ -1,34 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid/algorithms/multigrid/MultiGrid.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/multigrid/Aggregates.h>
#include <Grid/algorithms/multigrid/Geometry.h>
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrixMultiRHS.h>

View File

@@ -54,10 +54,7 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) { assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -69,7 +66,7 @@ public:
} }
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
@@ -103,10 +100,7 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) { assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -151,10 +145,7 @@ public:
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
if ( (_Tp*)ptr == (_Tp *) NULL ) { assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@@ -174,48 +165,19 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview #ifdef ACCELERATOR_CSHIFT
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate // Cshift on device
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page template<class T> using cshiftAllocator = devAllocator<T>;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector #else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
/* template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> class vecView template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
{ template<class T> using commVector = std::vector<T,devAllocator<T> >;
protected: template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
T * data; template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
uint64_t size;
ViewMode mode;
void * cpu_ptr;
public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(Vector<T> &refer_to_me,ViewMode _mode)
{
cpu_ptr = &refer_to_me[0];
size = refer_to_me.size();
mode = _mode;
data =(T *) MemoryManager::ViewOpen(cpu_ptr,
size*sizeof(T),
mode,
AdviseDefault);
}
void ViewClose(void)
{ // Inform the manager
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
{
vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed
}
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -16,44 +16,6 @@ NAMESPACE_BEGIN(Grid);
uint64_t total_shared; uint64_t total_shared;
uint64_t total_device; uint64_t total_device;
uint64_t total_host;; uint64_t total_host;;
#if defined(__has_feature)
#if __has_feature(leak_sanitizer)
#define ASAN_LEAK_CHECK
#endif
#endif
#ifdef ASAN_LEAK_CHECK
#include <sanitizer/asan_interface.h>
#include <sanitizer/common_interface_defs.h>
#include <sanitizer/lsan_interface.h>
#define LEAK_CHECK(A) { __lsan_do_recoverable_leak_check(); }
#else
#define LEAK_CHECK(A) { }
#endif
void MemoryManager::DisplayMallinfo(void)
{
#ifdef __linux__
struct mallinfo mi; // really want mallinfo2, but glibc version isn't uniform
mi = mallinfo();
std::cout << "MemoryManager: Total non-mmapped bytes (arena): "<< (size_t)mi.arena<<std::endl;
std::cout << "MemoryManager: # of free chunks (ordblks): "<< (size_t)mi.ordblks<<std::endl;
std::cout << "MemoryManager: # of free fastbin blocks (smblks): "<< (size_t)mi.smblks<<std::endl;
std::cout << "MemoryManager: # of mapped regions (hblks): "<< (size_t)mi.hblks<<std::endl;
std::cout << "MemoryManager: Bytes in mapped regions (hblkhd): "<< (size_t)mi.hblkhd<<std::endl;
std::cout << "MemoryManager: Max. total allocated space (usmblks): "<< (size_t)mi.usmblks<<std::endl;
std::cout << "MemoryManager: Free bytes held in fastbins (fsmblks): "<< (size_t)mi.fsmblks<<std::endl;
std::cout << "MemoryManager: Total allocated space (uordblks): "<< (size_t)mi.uordblks<<std::endl;
std::cout << "MemoryManager: Total free space (fordblks): "<< (size_t)mi.fordblks<<std::endl;
std::cout << "MemoryManager: Topmost releasable block (keepcost): "<< (size_t)mi.keepcost<<std::endl;
#endif
LEAK_CHECK();
}
void MemoryManager::PrintBytes(void) void MemoryManager::PrintBytes(void)
{ {
std::cout << " MemoryManager : ------------------------------------ "<<std::endl; std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
@@ -73,7 +35,7 @@ void MemoryManager::PrintBytes(void)
#ifdef GRID_CUDA #ifdef GRID_CUDA
cuda_mem(); cuda_mem();
#endif #endif
DisplayMallinfo();
} }
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; } uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
@@ -292,7 +254,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{ {
#ifdef GRID_OMP #ifdef GRID_OMP
GRID_ASSERT(omp_in_parallel()==0); assert(omp_in_parallel()==0);
#endif #endif
if (ncache == 0) return ptr; if (ncache == 0) return ptr;
@@ -345,7 +307,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{ {
#ifdef GRID_OMP #ifdef GRID_OMP
GRID_ASSERT(omp_in_parallel()==0); assert(omp_in_parallel()==0);
#endif #endif
for(int e=0;e<ncache;e++){ for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {

View File

@@ -209,10 +209,9 @@ private:
static void CpuViewClose(uint64_t Ptr); static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif #endif
static void NotifyDeletion(void * CpuPtr);
public: public:
static void DisplayMallinfo(void);
static void NotifyDeletion(void * CpuPtr);
static void Print(void); static void Print(void);
static void PrintAll(void); static void PrintAll(void);
static void PrintState( void* CpuPtr); static void PrintState( void* CpuPtr);

View File

@@ -1,15 +1,16 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#ifndef GRID_UVM #ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define MAXLINE 512 #define MAXLINE 512
static char print_buffer [ MAXLINE ]; static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl; #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl; #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
//#define dprintf(...) //#define dprintf(...)
//#define mprintf(...)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// For caching copies of data on device // For caching copies of data on device
@@ -50,12 +51,12 @@ int MemoryManager::EntryPresent(uint64_t CpuPtr)
{ {
if(AccViewTable.empty()) return 0; if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); GRID_ASSERT((count==0)||(count==1)); auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
return count; return count;
} }
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{ {
GRID_ASSERT(!EntryPresent(CpuPtr)); assert(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache; AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr; AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL; AccCache.AccPtr = (uint64_t)NULL;
@@ -69,9 +70,9 @@ void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,View
} }
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr) MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{ {
GRID_ASSERT(EntryPresent(CpuPtr)); assert(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr); auto AccCacheIterator = AccViewTable.find(CpuPtr);
GRID_ASSERT(AccCacheIterator!=AccViewTable.end()); assert(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator; return AccCacheIterator;
} }
void MemoryManager::EntryErase(uint64_t CpuPtr) void MemoryManager::EntryErase(uint64_t CpuPtr)
@@ -81,7 +82,7 @@ void MemoryManager::EntryErase(uint64_t CpuPtr)
} }
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache) void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{ {
GRID_ASSERT(AccCache.LRU_valid==0); assert(AccCache.LRU_valid==0);
if (AccCache.transient) { if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr); LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end(); AccCache.LRU_entry = --LRU.end();
@@ -94,7 +95,7 @@ void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
} }
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache) void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{ {
GRID_ASSERT(AccCache.LRU_valid==1); assert(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry); LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0; AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes; DeviceLRUBytes-=AccCache.bytes;
@@ -108,19 +109,19 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
// Remove from Accelerator, remove entry, without flush // Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool. // Cannot be locked. If allocated Must be in LRU pool.
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
GRID_ASSERT(AccCache.state!=Empty); assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
GRID_ASSERT(AccCache.accLock==0); assert(AccCache.accLock==0);
GRID_ASSERT(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) { if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++; DeviceDestroy++;
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
LRUremove(AccCache); LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL; AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
} }
uint64_t CpuPtr = AccCache.CpuPtr; uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr); EntryErase(CpuPtr);
@@ -138,9 +139,9 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
// Take these OUT LRU queue when CPU locked? // Take these OUT LRU queue when CPU locked?
// Cannot take out the table as cpuLock data is important. // Cannot take out the table as cpuLock data is important.
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
GRID_ASSERT(AccCache.state!=Empty); assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld", mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return; if (AccCache.accLock!=0) return;
@@ -154,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)NULL; AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes); dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
} }
// uint64_t CpuPtr = AccCache.CpuPtr; // uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++; DeviceEvictions++;
@@ -162,30 +163,28 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
} }
void MemoryManager::Flush(AcceleratorViewEntry &AccCache) void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{ {
GRID_ASSERT(AccCache.state==AccDirty); assert(AccCache.state==AccDirty);
GRID_ASSERT(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.accLock==0); assert(AccCache.accLock==0);
GRID_ASSERT(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.AccPtr!=(uint64_t)NULL);
GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes; DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++; DeviceToHostXfer++;
AccCache.state=Consistent; AccCache.state=Consistent;
} }
void MemoryManager::Clone(AcceleratorViewEntry &AccCache) void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{ {
GRID_ASSERT(AccCache.state==CpuDirty); assert(AccCache.state==CpuDirty);
GRID_ASSERT(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.accLock==0); assert(AccCache.accLock==0);
GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){ if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes; DeviceBytes+=AccCache.bytes;
} }
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx", mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes; HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++; HostToDeviceXfer++;
@@ -194,10 +193,10 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{ {
GRID_ASSERT(AccCache.state!=Empty); assert(AccCache.state!=Empty);
GRID_ASSERT(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.accLock==0); assert(AccCache.accLock==0);
GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){ if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes; DeviceBytes+=AccCache.bytes;
@@ -211,36 +210,33 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode) void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{ {
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr); dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr); CpuViewClose((uint64_t)Ptr);
} else { } else {
GRID_ASSERT(0); assert(0);
} }
} }
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint) void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{ {
uint64_t CpuPtr = (uint64_t)_CpuPtr; uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr); dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else { } else {
GRID_ASSERT(0); assert(0);
return NULL; return NULL;
} }
} }
void MemoryManager::EvictVictims(uint64_t bytes) void MemoryManager::EvictVictims(uint64_t bytes)
{ {
if(bytes>=DeviceMaxBytes) { assert(bytes<DeviceMaxBytes);
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
GRID_ASSERT(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){ while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){ if ( DeviceLRUBytes > 0){
GRID_ASSERT(LRU.size()>0); assert(LRU.size()>0);
uint64_t victim = LRU.back(); // From the LRU uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim); auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
@@ -264,19 +260,19 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
if (!AccCache.AccPtr) { if (!AccCache.AccPtr) {
EvictVictims(bytes); EvictVictims(bytes);
} }
GRID_ASSERT((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
GRID_ASSERT(AccCache.cpuLock==0); // Programming error assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld", dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
(uint64_t)AccCache.CpuPtr, (uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr, (uint64_t)CpuPtr,
(uint64_t)AccCache.bytes, (uint64_t)AccCache.bytes,
(uint64_t)bytes, (uint64_t)bytes,
(uint64_t)AccCache.accLock); (uint64_t)AccCache.accLock);
GRID_ASSERT(AccCache.CpuPtr == CpuPtr); assert(AccCache.CpuPtr == CpuPtr);
GRID_ASSERT(AccCache.bytes ==bytes); assert(AccCache.bytes ==bytes);
} }
/* /*
* State transitions and actions * State transitions and actions
@@ -293,7 +289,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
* AccWrite AccDirty AccDirty - - * AccWrite AccDirty AccDirty - -
*/ */
if(AccCache.state==Empty) { if(AccCache.state==Empty) {
GRID_ASSERT(AccCache.LRU_valid==0); assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr; AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL; AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes; AccCache.bytes = bytes;
@@ -309,7 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent AccCache.state = Consistent; // Empty + AccRead => Consistent
} }
AccCache.accLock= 1; AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock); dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){ } else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) { if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache); CpuDiscard(AccCache);
@@ -322,30 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
} }
AccCache.accLock++; AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock); dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) { } else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else else
AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++; AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock); dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) { } else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++; AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock); dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
} else { } else {
GRID_ASSERT(0); assert(0);
} }
GRID_ASSERT(AccCache.accLock>0); assert(AccCache.accLock>0);
// If view is opened on device must remove from LRU // If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){ if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU // must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU "); dprintf("AccCache entry removed from LRU \n");
LRUremove(AccCache); LRUremove(AccCache);
} }
@@ -362,16 +358,16 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
auto AccCacheIterator = EntryLookup(CpuPtr); auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
GRID_ASSERT(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
GRID_ASSERT(AccCache.accLock>0); assert(AccCache.accLock>0);
AccCache.accLock--; AccCache.accLock--;
// Move to LRU queue if not locked and close on device // Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) { if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache); LRUinsert(AccCache);
} else { } else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
} }
} }
void MemoryManager::CpuViewClose(uint64_t CpuPtr) void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -379,8 +375,8 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
auto AccCacheIterator = EntryLookup(CpuPtr); auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
GRID_ASSERT(AccCache.cpuLock>0); assert(AccCache.cpuLock>0);
GRID_ASSERT(AccCache.accLock==0); assert(AccCache.accLock==0);
AccCache.cpuLock--; AccCache.cpuLock--;
} }
@@ -413,12 +409,12 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
// EvictVictims(bytes); // EvictVictims(bytes);
// } // }
GRID_ASSERT((mode==CpuRead)||(mode==CpuWrite)); assert((mode==CpuRead)||(mode==CpuWrite));
GRID_ASSERT(AccCache.accLock==0); // Programming error assert(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
GRID_ASSERT(AccCache.CpuPtr == CpuPtr); assert(AccCache.CpuPtr == CpuPtr);
GRID_ASSERT(AccCache.bytes==bytes); assert(AccCache.bytes==bytes);
} }
if(AccCache.state==Empty) { if(AccCache.state==Empty) {
@@ -433,20 +429,20 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++; AccCache.cpuLock++;
} else if(AccCache.state==Consistent) { } else if(AccCache.state==Consistent) {
GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL); assert(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite) if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++; AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) { } else if(AccCache.state==AccDirty) {
GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL); assert(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache); Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++; AccCache.cpuLock++;
} else { } else {
GRID_ASSERT(0); // should be unreachable assert(0); // should be unreachable
} }
AccCache.transient= transient? EvictNext : 0; AccCache.transient= transient? EvictNext : 0;
@@ -478,7 +474,6 @@ void MemoryManager::Print(void)
std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl; std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl; std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl; std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
acceleratorMem();
std::cout << GridLogMessage << "--------------------------------------------" << std::endl; std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
} }
void MemoryManager::PrintAll(void) void MemoryManager::PrintAll(void)
@@ -528,12 +523,12 @@ void MemoryManager::Audit(std::string s)
std::cout << " Memory Manager::Audit() from "<<s<<std::endl; std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){ for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it; uint64_t cpuPtr = *it;
GRID_ASSERT(EntryPresent(cpuPtr)); assert(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr); auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes; LruBytes2+=AccCache.bytes;
GRID_ASSERT(AccCache.LRU_valid==1); assert(AccCache.LRU_valid==1);
GRID_ASSERT(AccCache.LRU_entry==it); assert(AccCache.LRU_entry==it);
} }
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl; std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
@@ -552,7 +547,7 @@ void MemoryManager::Audit(std::string s)
if( AccCache.LRU_valid ) LruCnt++; if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) { if ( AccCache.cpuLock || AccCache.accLock ) {
GRID_ASSERT(AccCache.LRU_valid==0); assert(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
@@ -561,16 +556,16 @@ void MemoryManager::Audit(std::string s)
<< "\t LRUvalid " << AccCache.LRU_valid<<std::endl; << "\t LRUvalid " << AccCache.LRU_valid<<std::endl;
} }
GRID_ASSERT( AccCache.cpuLock== 0 ) ; assert( AccCache.cpuLock== 0 ) ;
GRID_ASSERT( AccCache.accLock== 0 ) ; assert( AccCache.accLock== 0 ) ;
} }
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl; std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
GRID_ASSERT(LruBytes1==LruBytes2); assert(LruBytes1==LruBytes2);
GRID_ASSERT(LruBytes1==DeviceLRUBytes); assert(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl; std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
GRID_ASSERT(AccBytes==DeviceBytes); assert(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl; std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
GRID_ASSERT(LruCnt == LRU.size()); assert(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl; std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
} }

View File

@@ -10,16 +10,16 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
#ifdef __linux__ #ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY); int fd = open("/proc/self/pagemap", O_RDONLY);
GRID_ASSERT(fd >= 0); assert(fd >= 0);
const int page_size = 4096; const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size; uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn; off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size; uint64_t npages = (BYTES + page_size-1) / page_size;
std::vector<uint64_t> pagedata(npages); uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET); uint64_t ret = lseek(fd, offset, SEEK_SET);
GRID_ASSERT(ret == offset); assert(ret == offset);
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages); ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
GRID_ASSERT(ret == sizeof(uint64_t) * npages); assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512; int nhugepages = npages / 512;
int n4ktotal, nnothuge; int n4ktotal, nnothuge;
n4ktotal = 0; n4ktotal = 0;

View File

@@ -70,8 +70,8 @@ public:
Coordinate _istride; // Inner stride i.e. within simd lane Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions). int _osites; // _isites*_osites = product(dimensions).
int _isites; int _isites;
int64_t _fsites; // _isites*_osites = product(dimensions). int _fsites; // _isites*_osites = product(dimensions).
int64_t _gsites; int _gsites;
Coordinate _slice_block;// subslice information Coordinate _slice_block;// subslice information
Coordinate _slice_stride; Coordinate _slice_stride;
Coordinate _slice_nblock; Coordinate _slice_nblock;
@@ -82,7 +82,6 @@ public:
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic; int LocallyPeriodic;
Coordinate _checker_dim_mask; Coordinate _checker_dim_mask;
int _checker_dim;
public: public:
@@ -165,7 +164,7 @@ public:
// //
if ( _simd_layout[dimension] > 2 ) { if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
if ( d != dimension ) GRID_ASSERT ( (_simd_layout[d]==1) ); if ( d != dimension ) assert ( (_simd_layout[d]==1) );
} }
permute_type = RotateBit; // How to specify distance; this is not just direction. permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type; return permute_type;
@@ -184,10 +183,10 @@ public:
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; }; inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; }; inline int lSites(void) const { return _isites*_osites; };
inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; }; inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;}; inline int Nd (void) const { return _ndimension;};
inline const Coordinate &LocalStarts(void) { return _lstart; }; inline const Coordinate LocalStarts(void) { return _lstart; };
inline const Coordinate &FullDimensions(void) { return _fdimensions;}; inline const Coordinate &FullDimensions(void) { return _fdimensions;};
inline const Coordinate &GlobalDimensions(void) { return _gdimensions;}; inline const Coordinate &GlobalDimensions(void) { return _gdimensions;};
inline const Coordinate &LocalDimensions(void) { return _ldimensions;}; inline const Coordinate &LocalDimensions(void) { return _ldimensions;};
@@ -215,15 +214,15 @@ public:
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Global addressing // Global addressing
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){ void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
GRID_ASSERT(gidx< gSites()); assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
} }
void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){ void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
GRID_ASSERT(lidx<lSites()); assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
} }
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){ void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
gidx=0; gidx=0;
int mult=1; int mult=1;
for(int mu=0;mu<_ndimension;mu++) { for(int mu=0;mu<_ndimension;mu++) {

View File

@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
// Coordinate _checker_dim_mask; Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
@@ -106,7 +106,6 @@ public:
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);; _checker_dim_mask.resize(_ndimension);;
_checker_dim = -1;
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
@@ -128,10 +127,10 @@ public:
// Use a reduced simd grid // Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl; //std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]); assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d]; _lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;

View File

@@ -57,17 +57,16 @@ class GridRedBlackCartesian : public GridBase
{ {
public: public:
// Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
// int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;
virtual int isCheckerBoarded(void) const { return 1; };
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;
} }
virtual int CheckerBoard(const Coordinate &site){ virtual int CheckerBoard(const Coordinate &site){
int linear=0; int linear=0;
GRID_ASSERT(site.size()==_ndimension); assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d]) if(_checker_dim_mask[d])
linear=linear+site[d]; linear=linear+site[d];
@@ -160,11 +159,11 @@ public:
_isCheckerBoarded = true; _isCheckerBoarded = true;
_checker_dim = checker_dim; _checker_dim = checker_dim;
GRID_ASSERT(checker_dim_mask[checker_dim] == 1); assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size(); _ndimension = dimensions.size();
GRID_ASSERT(checker_dim_mask.size() == _ndimension); assert(checker_dim_mask.size() == _ndimension);
GRID_ASSERT(processor_grid.size() == _ndimension); assert(processor_grid.size() == _ndimension);
GRID_ASSERT(simd_layout.size() == _ndimension); assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension); _fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension); _gdimensions.resize(_ndimension);
@@ -190,20 +189,20 @@ public:
if (d == _checker_dim) if (d == _checker_dim)
{ {
GRID_ASSERT((_gdimensions[d] & 0x1) == 0); assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2; _gsites /= 2;
} }
_ldimensions[d] = _gdimensions[d] / _processors[d]; _ldimensions[d] = _gdimensions[d] / _processors[d];
GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]); assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d]; _lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
// Use a reduced simd grid // Use a reduced simd grid
_simd_layout[d] = simd_layout[d]; _simd_layout[d] = simd_layout[d];
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
GRID_ASSERT(_rdimensions[d] > 0); assert(_rdimensions[d] > 0);
// all elements of a simd vector must have same checkerboard. // all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d // If Ls vectorised, this must still be the case; e.g. dwf rb5d

View File

@@ -57,29 +57,18 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c) void CartesianCommunicator::GlobalSum(ComplexF &c)
{ {
GlobalSumVector((float *)&c,2); GlobalSumVector((float *)&c,2);
} }
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{ {
GlobalSumVector((float *)c,2*N); GlobalSumVector((float *)c,2*N);
} }
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{ {
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);

View File

@@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ; extern bool Stencil_force_mpi ;
@@ -108,7 +106,7 @@ public:
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ; static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, uint64_t bytes); static void BroadcastWorld(int root,void* data, int bytes);
static void BarrierWorld(void); static void BarrierWorld(void);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
@@ -130,35 +128,6 @@ public:
void GlobalXOR(uint32_t &); void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &); void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering GRID_ASSERT in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){ template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type; typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type); int words = sizeof(obj)/sizeof(scalar_type);
@@ -169,44 +138,24 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way // Face exchange, buffer swap in translational invariant way
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
uint64_t bytes,int dir);
void SendToRecvFrom(void *xmit, void SendToRecvFrom(void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,
int recv_from_rank, int recv_from_rank,
uint64_t bytes); int bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit, double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,
void *recv, void *recv,
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
uint64_t bytes,int dir); int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,
void *recv, void *recv,
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
uint64_t xbytes,uint64_t rbytes,int dir); int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int xmit_to_rank,int do_xmit,
void *recv,void *recv_comp,
int recv_from_rank,int do_recv,
uint64_t xbytes,uint64_t rbytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i); void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
@@ -220,20 +169,20 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Broadcast a buffer and composite larger // Broadcast a buffer and composite larger
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, uint64_t bytes); void Broadcast(int root,void* data, int bytes);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// All2All down one dimension // All2All down one dimension
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){ template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
GRID_ASSERT(dim>=0); assert(dim>=0);
GRID_ASSERT(dim<_ndimension); assert(dim<_ndimension);
GRID_ASSERT(in.size()==out.size()); assert(in.size()==out.size());
int numnode = _processors[dim]; int numnode = _processors[dim];
uint64_t bytes=sizeof(T); uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode; uint64_t words=in.size()/numnode;
GRID_ASSERT(numnode * words == in.size()); assert(numnode * words == in.size());
GRID_ASSERT(words < (1ULL<<31)); assert(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes); AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
} }
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes); void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);

View File

@@ -28,17 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
void GridAbort(void) { MPI_Abort(MPI_COMM_WORLD,SIGABRT); }
extern void * Grid_backtrace_buffer[_NBACKTRACE];
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
#ifdef GRID_CHECKSUM_COMMS
uint64_t checksum_index = 1;
#endif
//////////////////////////////////////////// ////////////////////////////////////////////
// First initialise of comms system // First initialise of comms system
@@ -63,11 +55,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
#endif #endif
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
GRID_ASSERT(0); assert(0);
} }
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) { if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
GRID_ASSERT(0); assert(0);
} }
} }
@@ -88,20 +80,20 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{ {
int rank; int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank); int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
GRID_ASSERT(ierr==0); assert(ierr==0);
return rank; return rank;
} }
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor) void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{ {
coor.resize(_ndimension); coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]); int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -128,8 +120,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
////////////////////////////////// //////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{ {
_ndimension = processors.size(); GRID_ASSERT(_ndimension>=1); _ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; GRID_ASSERT(_ndimension >= parent._ndimension); int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0); Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1); Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1); Coordinate shm_processors (_ndimension,1);
@@ -153,7 +145,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
childsize *= processors[d]; childsize *= processors[d];
} }
int Nchild = Nparent/childsize; int Nchild = Nparent/childsize;
GRID_ASSERT (childsize * Nchild == Nparent); assert (childsize * Nchild == Nparent);
Coordinate ccoor(_ndimension); // coor within subcommunicator Coordinate ccoor(_ndimension); // coor within subcommunicator
Coordinate scoor(_ndimension); // coor of split within parent Coordinate scoor(_ndimension); // coor of split within parent
@@ -179,12 +171,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
// Split the communicator // Split the communicator
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split); int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
GRID_ASSERT(ierr==0); assert(ierr==0);
} else { } else {
srank = 0; srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split); int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -209,7 +201,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
} }
} }
for(int d=0;d<processors.size();d++){ for(int d=0;d<processors.size();d++){
GRID_ASSERT(_processor_coor[d] == ccoor[d] ); assert(_processor_coor[d] == ccoor[d] );
} }
} }
@@ -251,7 +243,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
for(int i=0;i<_ndimension*2;i++){ for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]); MPI_Comm_dup(communicator,&communicator_halo[i]);
} }
GRID_ASSERT(Size==_Nprocessors); assert(Size==_Nprocessors);
} }
CartesianCommunicator::~CartesianCommunicator() CartesianCommunicator::~CartesianCommunicator()
@@ -265,176 +257,104 @@ CartesianCommunicator::~CartesianCommunicator()
} }
} }
} }
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce float");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
GRID_ASSERT(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){ void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce uint32_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(uint64_t &u){ void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce uint64_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){ void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalXOR(uint32_t &u){ void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalXOR(uint64_t &u){ void CartesianCommunicator::GlobalXOR(uint64_t &u){
FlightRecorder::StepLog("GlobalXOR");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalMax(float &f) void CartesianCommunicator::GlobalMax(float &f)
{ {
FlightRecorder::StepLog("GlobalMax");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalMax(double &d) void CartesianCommunicator::GlobalMax(double &d)
{ {
FlightRecorder::StepLog("GlobalMax");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(float *f,int N) void CartesianCommunicator::GlobalSumVector(float *f,int N)
{ {
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(double *d,int N) void CartesianCommunicator::GlobalSumVector(double *d,int N)
{ {
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
uint64_t bytes,int dir)
{
MPI_Request xrq;
MPI_Request rrq;
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
int tag;
tag= dir+from*32;
int ierr=MPI_Irecv(recv,(int)( bytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator,&rrq);
GRID_ASSERT(ierr==0);
list.push_back(rrq);
tag= dir+_processor*32;
ierr =MPI_Isend(xmit,(int)(bytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator,&xrq);
GRID_ASSERT(ierr==0);
list.push_back(xrq);
}
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
GRID_ASSERT(ierr==0);
list.resize(0);
}
// Basic Halo comms primitive // Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit, void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest, int dest,
void *recv, void *recv,
int from, int from,
uint64_t bytes) int bytes)
{ {
std::vector<MpiCommsRequest_t> reqs(0); std::vector<CommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
// Enforce no UVM in comms, device or host OK // Enforce no UVM in comms, device or host OK
GRID_ASSERT(acceleratorIsCommunicable(xmit)); assert(acceleratorIsCommunicable(xmit));
GRID_ASSERT(acceleratorIsCommunicable(recv)); assert(acceleratorIsCommunicable(recv));
// Give the CPU to MPI immediately; can use threads to overlap optionally // Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,dest,myrank, ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,(int)(bytes/sizeof(int32_t)),MPI_INT32_T,from, from, recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
GRID_ASSERT(ierr==0); assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
} }
// Basic Halo comms primitive // Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dox, int dest, int dox,
void *recv, void *recv,
int from, int dor, int from, int dor,
uint64_t bytes,int dir) int bytes,int dir)
{ {
std::vector<CommsRequest_t> list; std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir); StencilSendToRecvFromComplete(list,dir);
return offbytes; return offbytes;
} }
int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}
#ifdef ACCELERATOR_AWARE_MPI #undef NVLINK_GET // Define to use get instead of put DMA
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {}; double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest,int dox, int dest,int dox,
void *recv, void *recv,
int from,int dor, int from,int dor,
uint64_t xbytes,uint64_t rbytes,int dir) int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,void *recv_comp,
int from,int dor,
uint64_t xbytes,uint64_t rbytes,int dir)
{ {
int ncomm =communicator_halo.size(); int ncomm =communicator_halo.size();
int commdir=dir%ncomm; int commdir=dir%ncomm;
@@ -447,431 +367,62 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int gfrom = ShmRanks[from]; int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor]; int gme = ShmRanks[_processor];
GRID_ASSERT(dest != _processor); assert(dest != _processor);
GRID_ASSERT(from != _processor); assert(from != _processor);
GRID_ASSERT(gme == ShmRank); assert(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag; int tag;
if ( dor ) { if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32; tag= dir+from*32;
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl; ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
ierr=MPI_Irecv(recv_comp,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq); assert(ierr==0);
GRID_ASSERT(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=rbytes; off_node_bytes+=rbytes;
} }
#ifdef NVLINK_GET #ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit); void *shm = (void *) this->ShmBufferTranslate(from,xmit);
GRID_ASSERT(shm!=NULL); assert(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif #endif
} }
// This is a NVLINK PUT
if (dox) { if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32; tag= dir+_processor*32;
ierr =MPI_Isend(xmit_comp,(int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq); ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
GRID_ASSERT(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=xbytes; off_node_bytes+=xbytes;
} else { } else {
#ifndef NVLINK_GET #ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv); void *shm = (void *) this->ShmBufferTranslate(dest,recv);
GRID_ASSERT(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif #endif
} }
} }
return off_node_bytes;
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
int nreq=list.size(); int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise(); acceleratorCopySynchronise();
if (nreq==0) return; if (nreq==0) return;
std::vector<MPI_Status> status(nreq); std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
GRID_ASSERT(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
uint64_t xbytes,uint64_t rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
#ifdef GRID_CHECKSUM_COMMS
rbytes += 8;
xbytes += 8;
#endif
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv,(int)(rbytes/sizeof(int32_t)), MPI_INT32_T,from,tag,communicator_halo[commdir],&rrq);
GRID_ASSERT(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
#ifdef GRID_CHECKSUM_COMMS
uint64_t xbytes_data = xbytes - 8;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
GRID_ASSERT(xbytes % 8 == 0);
// flip one bit so that a zero buffer is not consistent
uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
*(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
#else
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
#endif
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// GRID_ASSERT(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0); assert(ierr==0);
list.resize(0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
#ifdef GRID_CHECKSUM_COMMS
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
#else
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
#endif
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
} }
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint64_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, (int)(xbytes/sizeof(int32_t)), MPI_INT32_T,dest,tag,communicator_halo[commdir],&xrq);
GRID_ASSERT(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,void *recv_comp,
int from,int dor,
uint64_t xbytes,uint64_t rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
GRID_ASSERT(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
GRID_ASSERT(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
GRID_ASSERT(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
uint64_t rbytes_data = list[r].bytes - 8;
uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
if (expected_cs != computed_cs) {
// TODO: error message, backtrace, quit
fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
fprintf(stderr, " processor = %d\n", (int)_processor);
for(int d=0;d<_processors.size();d++)
fprintf(stderr, " processor_coord[%d] = %d\n", d, _processor_coor[d]);
fprintf(stderr, " hostname: %s\n", GridHostname());
fprintf(stderr, " expected_cs: %ld\n", expected_cs);
fprintf(stderr, " computed_cs: %ld\n", computed_cs);
fprintf(stderr, " dest: %d\n", list[r].dest);
fprintf(stderr, " tag: %d\n", list[r].tag);
fprintf(stderr, " commdir: %d\n", list[r].commdir);
fprintf(stderr, " bytes: %ld\n", (uint64_t)list[r].bytes);
fflush(stderr);
// backtrace
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
exit(1);
}
}
}
checksum_index += 1;
#endif
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void) void CartesianCommunicator::StencilBarrier(void)
{ {
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm); MPI_Barrier (ShmComm);
} }
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -879,19 +430,17 @@ void CartesianCommunicator::StencilBarrier(void)
//} //}
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void)
{ {
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator); int ierr = MPI_Barrier(communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::Broadcast(int root,void* data,uint64_t bytes) void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{ {
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data, int ierr=MPI_Bcast(data,
(int)bytes, bytes,
MPI_BYTE, MPI_BYTE,
root, root,
communicator); communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
int CartesianCommunicator::RankWorld(void){ int CartesianCommunicator::RankWorld(void){
int r; int r;
@@ -899,25 +448,23 @@ int CartesianCommunicator::RankWorld(void){
return r; return r;
} }
void CartesianCommunicator::BarrierWorld(void){ void CartesianCommunicator::BarrierWorld(void){
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world); int ierr = MPI_Barrier(communicator_world);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{ {
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data, int ierr= MPI_Bcast(data,
(int)bytes, bytes,
MPI_BYTE, MPI_BYTE,
root, root,
communicator_world); communicator_world);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
Coordinate row(_ndimension,1); Coordinate row(_ndimension,1);
GRID_ASSERT(dim>=0 && dim<_ndimension); assert(dim>=0 && dim<_ndimension);
// Split the communicator // Split the communicator
row[dim] = _processors[dim]; row[dim] = _processors[dim];
@@ -928,7 +475,6 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
} }
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{ {
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments // MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data. // 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
@@ -938,8 +484,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
int ibytes; int ibytes;
iwords = words; iwords = words;
ibytes = bytes; ibytes = bytes;
GRID_ASSERT(words == iwords); // safe to cast to int ? assert(words == iwords); // safe to cast to int ?
GRID_ASSERT(bytes == ibytes); // safe to cast to int ? assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object); MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object); MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator); MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);

View File

@@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
void GridAbort(void) { abort(); }
void CartesianCommunicator::Init(int *argc, char *** arv) void CartesianCommunicator::Init(int *argc, char *** arv)
{ {
GlobalSharedMemory::Init(communicator_world); GlobalSharedMemory::Init(communicator_world);
@@ -56,14 +54,14 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{ {
_shm_processors = Coordinate(processors.size(),1); _shm_processors = Coordinate(processors.size(),1);
_processors = processors; _processors = processors;
_ndimension = processors.size(); GRID_ASSERT(_ndimension>=1); _ndimension = processors.size(); assert(_ndimension>=1);
_processor_coor.resize(_ndimension); _processor_coor.resize(_ndimension);
// Require 1^N processor grid for fake // Require 1^N processor grid for fake
_Nprocessors=1; _Nprocessors=1;
_processor = 0; _processor = 0;
for(int d=0;d<_ndimension;d++) { for(int d=0;d<_ndimension;d++) {
GRID_ASSERT(_processors[d]==1); assert(_processors[d]==1);
_processor_coor[d] = 0; _processor_coor[d] = 0;
} }
SetCommunicator(communicator_world); SetCommunicator(communicator_world);
@@ -89,21 +87,10 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest, int dest,
void *recv, void *recv,
int from, int from,
uint64_t bytes) int bytes)
{ {
GRID_ASSERT(0); assert(0);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ GRID_ASSERT(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
uint64_t bytes,int dir)
{
GRID_ASSERT(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
bcopy(in,out,bytes*words); bcopy(in,out,bytes*words);
@@ -115,8 +102,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
int CartesianCommunicator::RankWorld(void){return 0;} int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){} void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, uint64_t bytes) {} void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, uint64_t bytes) { } void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
void CartesianCommunicator::BarrierWorld(void) { } void CartesianCommunicator::BarrierWorld(void) { }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;} int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; } void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
@@ -126,33 +113,20 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0; dest=0;
} }
int CartesianCommunicator::IsOffNode(int rank) { return false; }
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,
void *recv, void *recv,
int recv_from_rank,int dor, int recv_from_rank,int dor,
uint64_t bytes, int dir) int bytes, int dir)
{ {
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {}; double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,
void *recv, void *recv,
int recv_from_rank,int dor, int recv_from_rank,int dor,
uint64_t xbytes,uint64_t rbytes, int dir) int xbytes,int rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit_comp,
int xmit_to_rank,int dox,
void *recv, void *recv_comp,
int recv_from_rank,int dor,
uint64_t xbytes,uint64_t rbytes, int dir)
{ {
return xbytes+rbytes; return xbytes+rbytes;
} }

View File

@@ -58,8 +58,8 @@ int GlobalSharedMemory::WorldNode;
void GlobalSharedMemory::SharedMemoryFree(void) void GlobalSharedMemory::SharedMemoryFree(void)
{ {
GRID_ASSERT(_ShmAlloc); assert(_ShmAlloc);
GRID_ASSERT(_ShmAllocBytes>0); assert(_ShmAllocBytes>0);
for(int r=0;r<WorldShmSize;r++){ for(int r=0;r<WorldShmSize;r++){
munmap(WorldShmCommBufs[r],_ShmAllocBytes); munmap(WorldShmCommBufs[r],_ShmAllocBytes);
} }
@@ -80,7 +80,7 @@ void *SharedMemory::HostBufferMalloc(size_t bytes){
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
GRID_ASSERT(host_heap_bytes<host_heap_size); assert(host_heap_bytes<host_heap_size);
} }
return ptr; return ptr;
} }
@@ -100,7 +100,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
GRID_ASSERT(heap_bytes<heap_size); assert(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl; //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr; return ptr;
@@ -127,13 +127,13 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
if ( str ) { if ( str ) {
std::vector<int> IntShmDims; std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims); GridCmdOptionIntVector(std::string(str),IntShmDims);
GRID_ASSERT(IntShmDims.size() == WorldDims.size()); assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1; long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) { for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]); ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
GRID_ASSERT(divides(ShmDims[dim],WorldDims[dim])); assert(divides(ShmDims[dim],WorldDims[dim]));
} }
GRID_ASSERT(ShmSize == WorldShmSize); assert(ShmSize == WorldShmSize);
return; return;
} }

View File

@@ -46,40 +46,8 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t; typedef int CommsRequest_t;
typedef int Grid_MPI_Comm; typedef int Grid_MPI_Comm;
#endif #endif
@@ -137,7 +105,7 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes); static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };

View File

@@ -42,7 +42,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef ACCELERATOR_AWARE_MPI #ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC #define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS #define SHM_SOCKETS
#else
#endif #endif
#include <syscall.h> #include <syscall.h>
#endif #endif
@@ -67,7 +66,7 @@ public:
{ {
int errnum; int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); GRID_ASSERT(sock>0); sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 }; struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX; sa_un.sun_family = AF_UNIX;
@@ -158,7 +157,7 @@ public:
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
GRID_ASSERT(_ShmSetup==0); assert(_ShmSetup==0);
WorldComm = comm; WorldComm = comm;
MPI_Comm_rank(WorldComm,&WorldRank); MPI_Comm_rank(WorldComm,&WorldRank);
MPI_Comm_size(WorldComm,&WorldSize); MPI_Comm_size(WorldComm,&WorldSize);
@@ -184,7 +183,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
// WorldNodes // WorldNodes
WorldNodes = WorldSize/WorldShmSize; WorldNodes = WorldSize/WorldShmSize;
GRID_ASSERT( (WorldNodes * WorldShmSize) == WorldSize ); assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ? // FIXME: Check all WorldShmSize are the same ?
@@ -209,7 +208,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MyGroup.resize(WorldShmSize); MyGroup.resize(WorldShmSize);
for(int rank=0;rank<WorldSize;rank++){ for(int rank=0;rank<WorldSize;rank++){
if(WorldShmRanks[rank]!=MPI_UNDEFINED){ if(WorldShmRanks[rank]!=MPI_UNDEFINED){
GRID_ASSERT(g<WorldShmSize); assert(g<WorldShmSize);
MyGroup[g++] = rank; MyGroup[g++] = rank;
} }
} }
@@ -225,7 +224,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
// global sum leaders over comm world // global sum leaders over comm world
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm); int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
GRID_ASSERT(ierr==0); assert(ierr==0);
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// find the group leaders world rank // find the group leaders world rank
@@ -246,7 +245,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNode=g; WorldNode=g;
} }
} }
GRID_ASSERT(WorldNode!=-1); assert(WorldNode!=-1);
_ShmSetup=1; _ShmSetup=1;
} }
// Gray encode support // Gray encode support
@@ -288,7 +287,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Assert power of two shm_size. // Assert power of two shm_size.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE); int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
GRID_ASSERT(log2size != -1); assert(log2size != -1);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname // Identify the hypercube coordinate of this node using hostname
@@ -309,7 +308,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Parse ICE-XA hostname to get hypercube location // Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen); gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
GRID_ASSERT(nscan==3); assert(nscan==3);
int nlo = N%9; int nlo = N%9;
int nhi = N/9; int nhi = N/9;
@@ -333,8 +332,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor; hypercoor=hypercoor-rootcoor;
GRID_ASSERT(hypercoor<WorldSize); assert(hypercoor<WorldSize);
GRID_ASSERT(hypercoor>=0); assert(hypercoor>=0);
////////////////////////////////////// //////////////////////////////////////
// Printing // Printing
@@ -382,7 +381,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
for(int i=0;i<ndimension;i++){ for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i]; Nprocessors*=processors[i];
} }
GRID_ASSERT(WorldSize==Nprocessors); assert(WorldSize==Nprocessors);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank // Establish mapping between lexico physics coord and WorldRank
@@ -401,7 +400,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Build the new communicator // Build the new communicator
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{ {
@@ -431,8 +430,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
for(int i=0;i<ndimension;i++){ for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i]; Nprocessors*=processors[i];
} }
// std::cerr << " WorldSize "<<WorldSize << " Nprocessors "<<Nprocessors<<" "<<processors<<std::endl; assert(WorldSize==Nprocessors);
GRID_ASSERT(WorldSize==Nprocessors);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank // Establish mapping between lexico physics coord and WorldRank
@@ -448,7 +446,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
// Build the new communicator // Build the new communicator
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET // SHMGET
@@ -457,8 +455,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
@@ -519,8 +517,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group // allocate the pointer array for shared windows for our group
@@ -539,8 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI #ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n"); HostCommBuf= malloc(bytes);
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#endif #endif
ShmCommBuf = acceleratorAllocDevice(bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
@@ -548,13 +545,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
if ( WorldRank == 0 ){ if ( WorldRank == 0 ){
std::cout << Mheader " acceleratorAllocDevice "<< bytes std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl; << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
} }
SharedMemoryZero(ShmCommBuf,bytes); SharedMemoryZero(ShmCommBuf,bytes);
if ( WorldRank == 0 ){ std::cout<< "Setting up IPC"<<std::endl;
std::cout<< Mheader "Setting up IPC"<<std::endl;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node // Loop over ranks/gpu's on our node
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -574,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle; ze_ipc_mem_handle_t ihandle;
clone_mem_t handle; clone_mem_t handle;
@@ -585,6 +580,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( err != ZE_RESULT_SUCCESS ) { if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
} }
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid(); handle.pid = getpid();
@@ -629,7 +626,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
MPI_BYTE, MPI_BYTE,
r, r,
WorldShmComm); WorldShmComm);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@@ -643,12 +640,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef SHM_SOCKETS #ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor(); myfd=UnixSockets::RecvFileDescriptor();
#else #else
// std::cout<<"mapping seeking remote pid/fd " std::cout<<"mapping seeking remote pid/fd "
// <<handle.pid<<"/" <<handle.pid<<"/"
// <<handle.fd<<std::endl; <<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0); int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n"; std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0); // int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0); myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno; int err_t = errno;
@@ -658,7 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(0); assert(0);
} }
#endif #endif
// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n"; std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle)); memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int)); memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -667,8 +664,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
} }
GRID_ASSERT(thisBuf!=nullptr); assert(thisBuf!=nullptr);
} }
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
@@ -709,8 +709,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -740,14 +740,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) { if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name); printf("mmap %s failed\n",shm_name);
perror("failed mmap"); GRID_ASSERT(0); perror("failed mmap"); assert(0);
} }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; // std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
} }
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
_ShmAlloc=1; _ShmAlloc=1;
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
}; };
@@ -757,8 +756,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -769,7 +768,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Hugetlbf and others map filesystems as mappable huge pages // Hugetlbf and others map filesystems as mappable huge pages
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX]; char shm_name [NAME_MAX];
GRID_ASSERT(WorldShmSize == 1); assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){ for(int r=0;r<WorldShmSize;r++){
int fd=-1; int fd=-1;
@@ -783,9 +782,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) { if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name); printf("mmap %s failed\n",shm_name);
perror("failed mmap"); GRID_ASSERT(0); perror("failed mmap"); assert(0);
} }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; // std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
@@ -804,8 +803,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm); MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize); WorldShmCommBufs.resize(WorldShmSize);
@@ -836,7 +835,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
perror("failed mmap"); perror("failed mmap");
assert(0); assert(0);
} }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
close(fd); close(fd);
@@ -857,8 +856,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( fd<0 ) { perror("failed shm_open"); assert(0); } if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); GRID_ASSERT(0); } if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
close(fd); close(fd);
@@ -881,14 +880,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{ {
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes); acceleratorCopyToDevice(src,dest,bytes);
//#else #else
// bcopy(src,dest,bytes); bcopy(src,dest,bytes);
//#endif #endif
//} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
@@ -915,7 +914,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer // Map ShmRank to WorldShmRank and use the right buffer
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
GRID_ASSERT (GlobalSharedMemory::ShmAlloc()==1); assert (GlobalSharedMemory::ShmAlloc()==1);
heap_size = GlobalSharedMemory::ShmAllocBytes(); heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){ for(int r=0;r<ShmSize;r++){
@@ -924,7 +923,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@@ -977,18 +975,19 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode; check[0]=GlobalSharedMemory::WorldNode;
check[1]=r; check[1]=r;
check[2]=magic; check[2]=magic;
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t)); GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
GRID_ASSERT(check[0]==GlobalSharedMemory::WorldNode);
GRID_ASSERT(check[1]==r);
GRID_ASSERT(check[2]==magic);
}
ShmBarrier(); ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl; GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
} }
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)
@@ -1003,14 +1002,12 @@ void *SharedMemory::ShmBuffer(int rank)
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{ {
int gpeer = ShmRanks[rank]; int gpeer = ShmRanks[rank];
GRID_ASSERT(gpeer!=ShmRank); // never send to self assert(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){ if (gpeer == MPI_UNDEFINED){
return NULL; return NULL;
} else { } else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset; uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote; return (void *) remote;
} }
} }

View File

@@ -34,7 +34,7 @@ NAMESPACE_BEGIN(Grid);
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
GRID_ASSERT(_ShmSetup==0); assert(_ShmSetup==0);
WorldComm = 0; WorldComm = 0;
WorldRank = 0; WorldRank = 0;
WorldSize = 1; WorldSize = 1;
@@ -62,8 +62,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl; std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ; void * ShmCommBuf ;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
@@ -92,8 +92,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
int mmap_flag =0; int mmap_flag =0;
#ifdef MAP_ANONYMOUS #ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS; mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
@@ -122,17 +122,17 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{ {
acceleratorMemSet(dest,0,bytes); acceleratorMemSet(dest,0,bytes);
} }
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{ {
// acceleratorCopyToDevice(src,dest,bytes); acceleratorCopyToDevice(src,dest,bytes);
//} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{ {
GRID_ASSERT(GlobalSharedMemory::ShmAlloc()==1); assert(GlobalSharedMemory::ShmAlloc()==1);
ShmRanks.resize(1); ShmRanks.resize(1);
ShmCommBufs.resize(1); ShmCommBufs.resize(1);
ShmRanks[0] = 0; ShmRanks[0] = 0;

View File

@@ -51,6 +51,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{ {

View File

@@ -30,11 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern std::vector<std::pair<int,int> > Cshift_table; extern std::vector<std::pair<int,int> > Cshift_table;
extern deviceVector<std::pair<int,int> > Cshift_table_device; extern commVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void) inline std::pair<int,int> *MapCshiftTable(void)
{ {
// GPU version // GPU version
#ifdef ACCELERATOR_CSHIFT
uint64_t sz=Cshift_table.size(); uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) { if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz); Cshift_table_device.resize(sz);
@@ -44,13 +45,16 @@ inline std::pair<int,int> *MapCshiftTable(void)
sizeof(Cshift_table[0])*sz); sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0]; return &Cshift_table_device[0];
#else
return &Cshift_table[0];
#endif
// CPU version use identify map // CPU version use identify map
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -90,10 +94,17 @@ Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dim
{ {
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@@ -118,6 +129,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){ if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@@ -128,10 +140,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else { } else {
Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask; Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
@@ -152,13 +175,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
} }
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@@ -202,10 +245,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{ {
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = MapCshiftTable(); auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorWriteDiscard); #ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
}); });
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
} }
} }
@@ -228,7 +278,8 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension];
autoView( rhs_v , rhs, AcceleratorWriteDiscard); #ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{ accelerator_for(nn,e1*e2,1,{
int n = nn%e1; int n = nn%e1;
int b = nn/e1; int b = nn/e1;
@@ -236,13 +287,21 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block; int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}); });
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
std::cout << "Scatter_plane merge GRID_ASSERT(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
GRID_ASSERT(0); // This will fail if hit on GPU assert(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite); autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
@@ -301,11 +360,19 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{ {
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWriteDiscard); autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@@ -345,11 +412,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
{ {
auto table = MapCshiftTable(); auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead); autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite); autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
}); });
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
} }
} }

View File

@@ -29,13 +29,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_MPI_H_ #ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_ #define _GRID_CSHIFT_MPI_H_
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#ifdef GRID_CHECKSUM_COMMS
extern uint64_t checksum_index;
#endif
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -49,20 +45,6 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
// Map to always positive shift modulo global full dimension. // Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd; shift = (shift+fd)%fd;
if( shift ==0 ) {
ret = rhs;
return ret;
}
//
// Potential easy fast cases:
// Shift is a multiple of the local lattice extent.
// Then need only to shift whole subvolumes
int L = rhs.Grid()->_ldimensions[dimension];
if ( (shift%L )==0 && !rhs.Grid()->CheckerBoarded(dimension) ) {
Cshift_simple(ret,rhs,dimension,shift);
return ret;
}
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension); ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
// the permute type // the permute type
@@ -83,59 +65,10 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
Cshift_comms(ret,rhs,dimension,shift); Cshift_comms(ret,rhs,dimension,shift);
} }
t1=usecond(); t1=usecond();
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; // std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret; return ret;
} }
///////////////////////////////////////////////////////////////////////////////
// Cshift_simple: move the whole local field in a single message.
//
//   ret       - destination lattice; written only when the dimension is
//               distributed over more than one rank
//   rhs       - source lattice
//   dimension - direction of the shift
//   shift     - shift in sites; only shift/ld (whole local extents) is used to
//               choose the partner ranks
//
// All oSites() objects of rhs are sent to xmit_to_rank and the same number of
// bytes is received from recv_from_rank straight into ret.
//
// NOTE(review): nothing is written to ret when _processors[dimension]==1, and
// ret.Checkerboard() is not set here - confirm callers rely on neither.
///////////////////////////////////////////////////////////////////////////////
template<class vobj> void Cshift_simple(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
  GridBase *grid=rhs.Grid();
  int comm_proc, xmit_to_rank, recv_from_rank;

  int ld = grid->_ldimensions[dimension];
  int pd = grid->_processors[dimension];
  int comm_dim = pd >1 ;

  // Displacement measured in processors, wrapped onto the processor grid
  comm_proc = ((shift)/ld)%pd;

  grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
  if(comm_dim) {

    int64_t bytes = sizeof(vobj) * grid->oSites();

    autoView(rhs_v , rhs, AcceleratorRead);
    autoView(ret_v , ret, AcceleratorWrite);
    void *send_buf = (void *)&rhs_v[0];
    void *recv_buf = (void *)&ret_v[0];

#ifdef ACCELERATOR_AWARE_MPI
    // MPI is handed the accelerator view pointers directly
    grid->SendToRecvFrom(send_buf,
			 xmit_to_rank,
			 recv_buf,
			 recv_from_rank,
			 bytes);
#else
    // Bounce through host buffers; static so the allocation is reused
    static hostVector<vobj> hrhs; hrhs.resize(grid->oSites());
    static hostVector<vobj> hret; hret.resize(grid->oSites());

    void *hsend_buf = (void *)&hrhs[0];
    void *hrecv_buf = (void *)&hret[0];

    // The buffers are already void*: pass them as they are. Indexing a void*
    // (&buf[0]) is ill-formed C++ and designates the same address anyway.
    acceleratorCopyFromDevice(send_buf,hsend_buf,bytes);

    grid->SendToRecvFrom(hsend_buf,
			 xmit_to_rank,
			 hrecv_buf,
			 recv_from_rank,
			 bytes);

    acceleratorCopyToDevice(hrecv_buf,recv_buf,bytes);
#endif
  }
}
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{ {
int sshift[2]; int sshift[2];
@@ -161,7 +94,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
//std::cout << "Single pass Cshift_comms" <<std::endl; //std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3); Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
@@ -171,6 +104,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -184,19 +119,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int pd = rhs.Grid()->_processors[dimension]; int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension]; int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ; int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
GRID_ASSERT(simd_layout==1); assert(simd_layout==1);
GRID_ASSERT(comm_dim==1); assert(comm_dim==1);
GRID_ASSERT(shift>=0); assert(shift>=0);
GRID_ASSERT(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size); static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size); static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
#endif
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -211,11 +141,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int comm_proc = ((x+sshift)/rd)%pd; int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) { if (comm_proc==0) {
FlightRecorder::StepLog("Cshift_Copy_plane");
tcopy-=usecond(); tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask); Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond(); tcopy+=usecond();
FlightRecorder::StepLog("Cshift_Copy_plane_complete");
} else { } else {
int words = buffer_size; int words = buffer_size;
@@ -223,84 +151,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
FlightRecorder::StepLog("Cshift_Gather_plane");
tgather-=usecond(); tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond(); tgather+=usecond();
FlightRecorder::StepLog("Cshift_Gather_plane_complete");
// int rank = grid->_processor; // int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
grid->Barrier(); // grid->Barrier();
FlightRecorder::StepLog("Cshift_SendRecv");
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
GRID_ASSERT(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
GRID_ASSERT(expected_cs == computed_cs);
#else
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
#endif
FlightRecorder::StepLog("Cshift_SendRecv_complete");
xbytes+=bytes; xbytes+=bytes;
grid->Barrier(); // grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
FlightRecorder::StepLog("Cshift_barrier_complete");
tscatter-=usecond(); tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
tscatter+=usecond(); tscatter+=usecond();
} }
} }
if (Cshift_verbose){ /*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
} */
} }
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -322,10 +205,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout // << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl; // << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
GRID_ASSERT(comm_dim==1); assert(comm_dim==1);
GRID_ASSERT(simd_layout==2); assert(simd_layout==2);
GRID_ASSERT(shift>=0); assert(shift>=0);
GRID_ASSERT(shift<fd); assert(shift<fd);
RealD tcopy=0.0; RealD tcopy=0.0;
RealD tgather=0.0; RealD tgather=0.0;
@@ -341,8 +224,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd); static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd); static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi; scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi; scalar_object * send_buf_extract_mpi;
@@ -350,18 +233,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract[s].resize(buffer_size); send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size); recv_buf_extract[s].resize(buffer_size);
} }
#ifndef ACCELERATOR_AWARE_MPI
#ifdef GRID_CHECKSUM_COMMS
buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#ifdef GRID_CHECKSUM_COMMS
buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
#endif
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
@@ -404,60 +275,24 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if (nbr_ic) nbr_lane|=inner_bit; if (nbr_ic) nbr_lane|=inner_bit;
GRID_ASSERT (sx == nbr_ox); assert (sx == nbr_ox);
if(nbr_proc){ if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
grid->Barrier(); // grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi, grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)recv_buf_extract_mpi, (void *)recv_buf_extract_mpi,
recv_from_rank, recv_from_rank,
bytes); bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
assert(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift_comms_simd: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
#endif
xbytes+=bytes; xbytes+=bytes;
grid->Barrier(); // grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
@@ -470,15 +305,242 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond(); tscatter+=usecond();
} }
if(Cshift_verbose){ /*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
} }
#else
///////////////////////////////////////////////////////////////////////////////
// Cshift_comms (staging-buffer variant, compiled only when
// ACCELERATOR_CSHIFT_NO_COPY is not defined).
//
//   ret       - destination lattice
//   rhs       - source lattice
//   dimension - direction of the shift; asserted to be distributed over ranks
//               (comm_dim==1) and not SIMD-split (simd_layout==1)
//   shift     - shift in sites, asserted to satisfy 0 <= shift < fd
//   cbmask    - checkerboard mask; 0x3 means both parities, otherwise only
//               half of the plane is communicated
//
// For every plane x of the reduced dimension the source plane
// sx=(x+sshift)%rd is either copied locally (comm_proc==0) or gathered,
// staged through a pair of ShmBufferMalloc'ed buffers, exchanged with
// SendToRecvFrom and scattered into plane x of ret.
///////////////////////////////////////////////////////////////////////////////
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
  GridBase *grid=rhs.Grid();

  int fd              = rhs.Grid()->_fdimensions[dimension];
  int rd              = rhs.Grid()->_rdimensions[dimension];
  int pd              = rhs.Grid()->_processors[dimension];
  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
  assert(simd_layout==1);
  assert(comm_dim==1);
  assert(shift>=0);
  assert(shift<fd);

  // Timers and byte counter feed only the commented-out report at the end
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;

  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];

  // Gather/scatter targets; static so the storage is reused between calls
  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);

  // Buffers actually handed to SendToRecvFrom, re-taken from the Shm pool on
  // every call after releasing whatever was allocated from it before
  vobj *send_buf;
  vobj *recv_buf;
  {
    grid->ShmBufferFreeAll();
    size_t bytes = buffer_size*sizeof(vobj);
    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
  }

  int cb= (cbmask==0x2)? Odd : Even;
  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

  for(int x=0;x<rd;x++){

    int sx        =  (x+sshift)%rd;        // source plane
    int comm_proc = ((x+sshift)/rd)%pd;    // displacement in processors

    if (comm_proc==0) {

      // Source plane is held by this rank
      tcopy-=usecond();
      Copy_plane(ret,rhs,dimension,x,sx,cbmask);
      tcopy+=usecond();

    } else {

      int words = buffer_size;
      if (cbmask != 0x3) words=words>>1;   // single parity: half the plane

      int bytes = words * sizeof(vobj);

      tgather-=usecond();
      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
      tgather+=usecond();

      //      int rank           = grid->_processor;
      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

      tcomms-=usecond();
      //      grid->Barrier();

      // gather buffer -> Shm send buffer
      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
      grid->SendToRecvFrom((void *)&send_buf[0],
			   xmit_to_rank,
			   (void *)&recv_buf[0],
			   recv_from_rank,
			   bytes);
      xbytes+=bytes;
      // Shm receive buffer -> scatter buffer
      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);

      //      grid->Barrier();
      tcomms+=usecond();

      tscatter-=usecond();
      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
      tscatter+=usecond();
    }
  }
  /*
  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
  */
}
///////////////////////////////////////////////////////////////////////////////
// Cshift_comms_simd (staging-buffer variant, compiled only when
// ACCELERATOR_CSHIFT_NO_COPY is not defined).
//
// Shift along a dimension that is asserted to be distributed over ranks
// (comm_dim==1) and split over two SIMD lanes groups (simd_layout==2).
// Each plane is extracted into one scalar buffer per SIMD lane; for every
// lane the peer lane/processor is worked out, off-node lanes are exchanged
// through a single pair of Shm staging buffers, and the resulting per-lane
// pointers are merged back into plane x of ret.
///////////////////////////////////////////////////////////////////////////////
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
  typedef typename vobj::scalar_object scalar_object;

  GridBase *grid  = rhs.Grid();
  const int Nsimd = grid->Nsimd();

  const int fd          = grid->_fdimensions[dimension];
  const int rd          = grid->_rdimensions[dimension];
  const int ld          = grid->_ldimensions[dimension];
  const int pd          = grid->_processors[dimension];
  const int simd_layout = grid->_simd_layout[dimension];
  const int comm_dim    = grid->_processors[dimension] >1 ;

  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
  //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
  //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;

  assert(comm_dim==1);
  assert(simd_layout==2);
  assert(shift>=0);
  assert(shift<fd);

  // Accumulators consumed only by the commented-out report at the bottom
  RealD tcopy=0.0;
  RealD tgather=0.0;
  RealD tscatter=0.0;
  RealD tcomms=0.0;
  uint64_t xbytes=0;

  const int permute_type = grid->PermuteType(dimension);

  ///////////////////////////////////////////////
  // Simd direction uses an extract/merge pair
  ///////////////////////////////////////////////
  const int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];

  // One extract buffer per SIMD lane, kept between calls
  static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd);
  static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd);

  // Single pair of Shm staging buffers shared by all lanes
  scalar_object *  recv_buf_extract_mpi;
  scalar_object *  send_buf_extract_mpi;
  {
    const size_t staging_bytes = sizeof(scalar_object)*buffer_size;
    grid->ShmBufferFreeAll();
    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(staging_bytes);
    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(staging_bytes);
  }
  for(int lane=0;lane<Nsimd;lane++){
    send_buf_extract[lane].resize(buffer_size);
    recv_buf_extract[lane].resize(buffer_size);
  }

  const int bytes = buffer_size*sizeof(scalar_object);

  ExtractPointerArray<scalar_object>  pointers(Nsimd); // gather targets
  ExtractPointerArray<scalar_object> rpointers(Nsimd); // merge sources

  ///////////////////////////////////////////
  // Work out what to send where
  ///////////////////////////////////////////
  const int cb     = (cbmask==0x2)? Odd : Even;
  const int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);

  // Bit of the lane index tested to pick the inner coordinate; does not
  // depend on x or on the lane, so compute it once
  const int inner_bit = (Nsimd>>(permute_type+1));

  // loop over outer coord planes orthog to dim
  for(int x=0;x<rd;x++){

    // FIXME call local permute copy if none are offnode.
    for(int lane=0;lane<Nsimd;lane++){
      pointers[lane] = &send_buf_extract[lane][0];
    }

    tgather-=usecond();
    const int sx = (x+sshift)%rd;
    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
    tgather+=usecond();

    for(int lane=0;lane<Nsimd;lane++){

      const int ic       = (lane&inner_bit)? 1:0;

      const int my_coor  = rd*ic + x;
      const int nbr_coor = my_coor+sshift;
      const int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors

      const int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
      const int nbr_ox   = (nbr_coor%rd);       // outer coord of peer

      // Same lane with the inner bit replaced by the peer's inner coordinate
      const int cleared  = (lane&(~inner_bit));
      const int nbr_lane = nbr_ic ? (cleared|inner_bit) : cleared;

      assert (sx == nbr_ox);

      if(!nbr_proc){
	// Peer lane lives on this rank: merge straight from the extract buffer
	rpointers[lane] = &send_buf_extract[nbr_lane][0];
	continue;
      }

      int recv_from_rank;
      int xmit_to_rank;
      grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

      tcomms-=usecond();
      //	grid->Barrier();

      // extract buffer -> Shm send buffer
      acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
      grid->SendToRecvFrom((void *)send_buf_extract_mpi,
			   xmit_to_rank,
			   (void *)recv_buf_extract_mpi,
			   recv_from_rank,
			   bytes);
      // Shm receive buffer -> this lane's receive buffer
      acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[lane][0],bytes);
      xbytes+=bytes;

      //	grid->Barrier();
      tcomms+=usecond();

      rpointers[lane] = &recv_buf_extract[lane][0];
    }

    tscatter-=usecond();
    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
    tscatter+=usecond();
  }
  /*
  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
  */
}
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@@ -1,5 +1,5 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
std::vector<std::pair<int,int> > Cshift_table; std::vector<std::pair<int,int> > Cshift_table;
deviceVector<std::pair<int,int> > Cshift_table_device; commVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -245,7 +245,7 @@ template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * =
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{ {
if ((cb == Odd) || (cb == Even)) { if ((cb == Odd) || (cb == Even)) {
GRID_ASSERT(cb == lat.Checkerboard()); assert(cb == lat.Checkerboard());
} }
cb = lat.Checkerboard(); cb = lat.Checkerboard();
} }

View File

@@ -257,30 +257,17 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
}); });
} }
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpy_norm"); GRID_TRACE("axpy_norm");
#ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y); return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
} }
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpby_norm"); GRID_TRACE("axpby_norm");
#ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y); return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
} }
/// Trace product /// Trace product

View File

@@ -120,12 +120,12 @@ public:
GRID_TRACE("ExpressionTemplateEval"); GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
GRID_ASSERT(egrid!=nullptr); assert(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
GRID_ASSERT( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto exprCopy = expr; auto exprCopy = expr;
@@ -144,12 +144,12 @@ public:
GRID_TRACE("ExpressionTemplateEval"); GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
GRID_ASSERT(egrid!=nullptr); assert(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
GRID_ASSERT( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto exprCopy = expr; auto exprCopy = expr;
@@ -168,12 +168,12 @@ public:
GRID_TRACE("ExpressionTemplateEval"); GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr); GridBase *egrid(nullptr);
GridFromExpression(egrid,expr); GridFromExpression(egrid,expr);
GRID_ASSERT(egrid!=nullptr); assert(egrid!=nullptr);
conformable(this->_grid,egrid); conformable(this->_grid,egrid);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
GRID_ASSERT( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto exprCopy = expr; auto exprCopy = expr;
ExpressionViewOpen(exprCopy); ExpressionViewOpen(exprCopy);
@@ -191,11 +191,11 @@ public:
Lattice(const LatticeUnaryExpression<Op,T1> & expr) { Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
this->_grid = nullptr; this->_grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(this->_grid,expr);
GRID_ASSERT(this->_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
GRID_ASSERT( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
@@ -206,11 +206,11 @@ public:
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) { Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
this->_grid = nullptr; this->_grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(this->_grid,expr);
GRID_ASSERT(this->_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
GRID_ASSERT( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
@@ -221,11 +221,11 @@ public:
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) { Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
this->_grid = nullptr; this->_grid = nullptr;
GridFromExpression(this->_grid,expr); GridFromExpression(this->_grid,expr);
GRID_ASSERT(this->_grid!=nullptr); assert(this->_grid!=nullptr);
int cb=-1; int cb=-1;
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
GRID_ASSERT( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
@@ -234,23 +234,10 @@ public:
} }
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(*vvtmp_p);
coalescedWrite(me[ss],stmp);
});
#else
auto me = View(CpuWrite); auto me = View(CpuWrite);
thread_for(ss,me.size(),{ thread_for(ss,me.size(),{
me[ss]= r; me[ss]= r;
}); });
#endif
me.ViewClose(); me.ViewClose();
return *this; return *this;
} }
@@ -264,7 +251,7 @@ public:
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) { Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid; this->_grid = grid;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
GRID_ASSERT((((uint64_t)&this->_odata[0])&0xF) ==0); assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0; this->checkerboard=0;
SetViewMode(mode); SetViewMode(mode);
} }
@@ -373,7 +360,7 @@ public:
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
for(int64_t g=0;g<o.Grid()->_gsites;g++){ for(int g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor; Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor); o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);

View File

@@ -53,19 +53,36 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field; typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View; typedef decltype(basis[0].View(AcceleratorRead)) View;
hostVector<View> h_basis_v(basis.size()); Vector<View> basis_v; basis_v.reserve(basis.size());
deviceVector<View> d_basis_v(basis.size()); typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t; typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
h_basis_v[k] = basis[k].View(AcceleratorWrite); basis_v.push_back(basis[k].View(AcceleratorWrite));
acceleratorPut(d_basis_v[k],h_basis_v[k]);
} }
View *basis_vp = &d_basis_v[0]; #if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
int nrot = j1-j0; int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda if (!nrot) // edge case not handled gracefully by Cuda
@@ -74,19 +91,17 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites(); uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
deviceVector <vobj> Bt(siteBlock * nrot); Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0]; auto Bp=&Bt[0];
// GPU readable copy of matrix // GPU readable copy of matrix
hostVector<Coeff_t> h_Qt_jv(Nm*Nm); Vector<Coeff_t> Qt_jv(Nm*Nm);
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0]; Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{ thread_for(i,Nm*Nm,{
int j = i/Nm; int j = i/Nm;
int k = i%Nm; int k = i%Nm;
h_Qt_jv[i]=Qt(j,k); Qt_p[i]=Qt(j,k);
}); });
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down // Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){ for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -122,8 +137,9 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j])); coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
}); });
} }
#endif
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose(); for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
} }
// Extract a single rotated vector // Extract a single rotated vector
@@ -136,19 +152,16 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard(); result.Checkerboard() = basis[0].Checkerboard();
hostVector<View> h_basis_v(basis.size()); Vector<View> basis_v; basis_v.reserve(basis.size());
deviceVector<View> d_basis_v(basis.size());
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
h_basis_v[k]=basis[k].View(AcceleratorRead); basis_v.push_back(basis[k].View(AcceleratorRead));
acceleratorPut(d_basis_v[k],h_basis_v[k]);
} }
vobj zz=Zero(); vobj zz=Zero();
deviceVector<double> Qt_jv(Nm); Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0]; double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k)); for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& d_basis_v[0]; auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite); autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero(); vobj zzz=Zero();
@@ -158,7 +171,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
} }
coalescedWrite(result_v[ss], B); coalescedWrite(result_v[ss], B);
}); });
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose(); for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
} }
template<class Field> template<class Field>
@@ -166,9 +179,9 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
{ {
int vlen = idx.size(); int vlen = idx.size();
GRID_ASSERT(vlen>=1); assert(vlen>=1);
GRID_ASSERT(vlen<=sort_vals.size()); assert(vlen<=sort_vals.size());
GRID_ASSERT(vlen<=_v.size()); assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) { for (size_t i=0;i<vlen;i++) {
@@ -186,7 +199,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
if (idx[j]==i) if (idx[j]==i)
break; break;
GRID_ASSERT(idx[i] > i); GRID_ASSERT(j!=idx.size()); GRID_ASSERT(idx[j]==i); assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]); std::swap(sort_vals[i],sort_vals[idx[i]]);
@@ -224,7 +237,7 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
template<class Field> template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) { void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero(); result = Zero();
GRID_ASSERT(_v.size()==eval.size()); assert(_v.size()==eval.size());
int N = (int)_v.size(); int N = (int)_v.size();
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
Field& tmp = _v[i]; Field& tmp = _v[i];

View File

@@ -32,8 +32,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs) template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{ {
GRID_ASSERT(lhs.Grid() == rhs.Grid()); assert(lhs.Grid() == rhs.Grid());
GRID_ASSERT(lhs.Checkerboard() == rhs.Checkerboard()); assert(lhs.Checkerboard() == rhs.Checkerboard());
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1) template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
{ {
auto ff = localNorm2(f); auto ff = localNorm2(f);
if ( mu==-1 ) mu = f.Grid()->Nd()-1; if ( mu==-1 ) mu = f.Grid()->Nd()-1;

View File

@@ -42,7 +42,7 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
// Lattice<vobj> Xslice(SliceGrid); // Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid); // Lattice<vobj> Rslice(SliceGrid);
GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@@ -86,7 +86,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int Nblock = X.Grid()->GlobalDimensions()[Orthog]; int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid(); GridBase *FullGrid = X.Grid();
GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
@@ -140,7 +140,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1); assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension; // int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension; // int nl = SliceGrid->_ndimension;
// int nl = nh-1; // int nl = nh-1;

View File

@@ -98,8 +98,8 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
GRID_ASSERT( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx; int rank,odx,idx;
// Optional to broadcast from node 0. // Optional to broadcast from node 0.
@@ -135,7 +135,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
GRID_ASSERT( l.Checkerboard() == l.Grid()->CheckerBoard(site)); assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
int rank,odx,idx; int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
@@ -159,14 +159,14 @@ template<class vobj,class sobj>
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site) inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{ {
GridBase *grid = l.getGrid(); GridBase *grid = l.getGrid();
GRID_ASSERT(l.mode==CpuRead); assert(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
// GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;
@@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
pt[w] = getlane(vp[w],idx); pt[w] = getlane(vp[w],idx);
} }
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
return; return;
}; };
template<class vobj,class sobj> template<class vobj,class sobj>
@@ -195,15 +195,15 @@ template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site) inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{ {
GridBase *grid=l.getGrid(); GridBase *grid=l.getGrid();
GRID_ASSERT(l.mode==CpuWrite); assert(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
// GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;

View File

@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
// const int Nsimd = vobj::Nsimd(); // const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
std::vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
sumarray[i]=Zero(); sumarray[i]=Zero();
} }
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
std::vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
sumarray[i]=Zero(); sumarray[i]=Zero();
} }
@@ -204,27 +204,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm); return real(nrm);
} }
template<class Op,class T1>
inline auto norm2(const LatticeUnaryExpression<Op,T1> & expr) ->RealD
{
return norm2(closure(expr));
}
template<class Op,class T1,class T2>
inline auto norm2(const LatticeBinaryExpression<Op,T1,T2> & expr) ->RealD
{
return norm2(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto norm2(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) ->RealD
{
return norm2(closure(expr));
}
//The global maximum of the site norm2 //The global maximum of the site norm2
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg) template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
{ {
@@ -264,8 +243,24 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
// Might make all code paths go this way. // Might make all code paths go this way.
#if 0
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
});
}
#else
typedef decltype(innerProduct(vobj(),vobj())) inner_t; typedef decltype(innerProduct(vobj(),vobj())) inner_t;
deviceVector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
{ {
@@ -279,6 +274,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
}); });
} }
#endif
// This is in single precision and fails some tests // This is in single precision and fails some tests
auto anrm = sumD(inner_tmp_v,sites); auto anrm = sumD(inner_tmp_v,sites);
nrm = anrm; nrm = anrm;
@@ -290,45 +286,23 @@ template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL #ifdef GRID_SYCL
// uint64_t csum=0; uint64_t csum=0;
// uint64_t csum2=0; if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
// if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) {
// {
// Hack // Hack
// Fast integer xor checksum. Can also be used in comms now. // Fast integer xor checksum. Can also be used in comms now.
// autoView(l_v,left,AcceleratorRead); autoView(l_v,left,AcceleratorRead);
// Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
// uint64_t *base= (uint64_t *)&l_v[0]; uint64_t *base= (uint64_t *)&l_v[0];
// csum=svm_xor(base,words); csum=svm_xor(base,words);
// ok = FlightRecorder::CsumLog(csum);
// if ( !ok ) {
// csum2=svm_xor(base,words);
// std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
// } else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
// }
// GRID_ASSERT(ok);
// }
#endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm);
ok = FlightRecorder::NormLog(real(nrm));
if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
GRID_ASSERT(ok);
} }
FlightRecorder::StepLog("Start global sum"); FlightRecorder::CsumLog(csum);
grid->GlobalSumP2P(nrm); #endif
// grid->GlobalSum(nrm); ComplexD nrm = rankInnerProduct(left,right);
FlightRecorder::StepLog("Finished global sum"); RealD local = real(nrm);
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl; FlightRecorder::NormLog(real(nrm));
grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm)); FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@@ -365,9 +339,20 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead); autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead); autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite); autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
deviceVector<inner_t> inner_tmp; Vector<inner_t> inner_tmp(sites);
inner_tmp.resize(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, nsimd,{
@@ -375,13 +360,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
}); });
bool ok;
nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm)); #endif
GRID_ASSERT(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@@ -391,7 +372,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
conformable(left,right); conformable(left,right);
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
std::vector<ComplexD> tmp(2); Vector<ComplexD> tmp(2);
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
@@ -401,8 +382,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
// GPU // GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t; typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t;
deviceVector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
deviceVector<norm_t> norm_tmp(sites); Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0]; auto norm_tmp_v = &norm_tmp[0];
{ {
@@ -452,9 +433,7 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data, template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
std::vector<typename vobj::scalar_object> &result,
int orthogdim)
{ {
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
// FIXME precision promoted summation // FIXME precision promoted summation
@@ -464,20 +443,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type; typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid(); GridBase *grid = Data.Grid();
GRID_ASSERT(grid!=NULL); assert(grid!=NULL);
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
GRID_ASSERT(orthogdim >= 0); assert(orthogdim >= 0);
GRID_ASSERT(orthogdim < Nd); assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim]; int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
std::vector<vobj> lvSum(rd); // will locally sum vectors first Vector<vobj> lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node result.resize(fd); // And then global sum to return the same vector to every node
@@ -525,8 +504,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
scalar_type * ptr = (scalar_type *) &result[0]; scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type); int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words); grid->GlobalSumVector(ptr, words);
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
} }
template<class vobj> inline template<class vobj> inline
std::vector<typename vobj::scalar_object> std::vector<typename vobj::scalar_object>
@@ -537,41 +514,28 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
return result; return result;
} }
/*
Reimplement
1)
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
2)
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
3)
-- Make Slice Mul Matrix call sliceMaddMatrix
*/
template<class vobj> template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim) static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid(); GridBase *grid = lhs.Grid();
GRID_ASSERT(grid!=NULL); assert(grid!=NULL);
conformable(grid,rhs.Grid()); conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension; const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
GRID_ASSERT(orthogdim >= 0); assert(orthogdim >= 0);
GRID_ASSERT(orthogdim < Nd); assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim]; int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim]; int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type> lvSum(rd); // will locally sum vectors first Vector<vector_type> lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file result.resize(fd); // And then global sum to return the same vector to every node for IO to file
@@ -701,96 +665,203 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
} }
}; };
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{ {
int NN = BlockSolverGrid->_ndimension; int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd(); int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(NN-1); std::vector<int> latt_phys(0);
Coordinate simd_phys; std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(NN-1); std::vector<int> mpi_phys(0);
Coordinate checker_dim_mask(NN-1);
int checker_dim=-1;
int dd;
for(int d=0;d<NN;d++){ for(int d=0;d<NN;d++){
if( d!=Orthog ) { if( d!=Orthog ) {
latt_phys[dd]=BlockSolverGrid->_fdimensions[d]; latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
mpi_phys[dd] =BlockSolverGrid->_processors[d]; simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d]; mpi_phys.push_back(BlockSolverGrid->_processors[d]);
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
dd++;
} }
} }
simd_phys=GridDefaultSimd(latt_phys.size(),nsimd); return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
if(BlockSolverGrid->_isCheckerBoarded) {
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
delete tmp;
return (GridBase *) ret;
} else {
return (GridBase *) tmp;
}
} }
*/
template<class vobj> template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{ {
GridBase *FullGrid = X.Grid();
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Ys(SliceGrid);
Lattice<vobj> Rs(SliceGrid);
Lattice<vobj> Xs(SliceGrid);
Lattice<vobj> RR(FullGrid);
RR = R; // Copies checkerboard for insert
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
for(int i=0;i<Nslice;i++){ int Nblock = X.Grid()->GlobalDimensions()[Orthog];
ExtractSlice(Ys,Y,i,Orthog);
ExtractSlice(Rs,R,i,Orthog); GridBase *FullGrid = X.Grid();
Rs=Ys; // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
for(int j=0;j<Nslice;j++){
ExtractSlice(Xs,X,j,Orthog); // Lattice<vobj> Xslice(SliceGrid);
Rs = Rs + Xs*(scale*aa(j,i)); // Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( X_v, X, CpuRead);
autoView( Y_v, Y, CpuRead);
autoView( R_v, R, CpuWrite);
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
} }
InsertSlice(Rs,RR,i,Orthog);
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
} }
R=RR; // Copy back handles arguments aliasing case
delete SliceGrid;
}; };
template<class vobj> template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{ {
R=Zero(); typedef typename vobj::scalar_object sobj;
sliceMaddMatrix(R,aa,X,R,Orthog,scale); typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl=1;
//FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
autoView( R_v, R, CpuWrite);
autoView( X_v, X, CpuRead);
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
}; };
template<class vobj> template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{ {
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
Lattice<vobj> ls(SliceGrid);
Lattice<vobj> rs(SliceGrid);
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
mat = Eigen::MatrixXcd::Zero(Nslice,Nslice); GridBase *FullGrid = lhs.Grid();
for(int s=0;s<Nslice;s++){ // GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
ExtractSlice(ls,lhs,s,Orthog);
for(int ss=0;ss<Nslice;ss++){ int Nblock = FullGrid->GlobalDimensions()[Orthog];
ExtractSlice(rs,rhs,ss,Orthog);
mat(s,ss) = innerProduct(ls,rs); // Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v, lhs, CpuRead);
autoView( rhs_v, rhs, CpuRead);
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}});
thread_critical
{
mat += mat_thread;
} }
} }
delete SliceGrid;
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -208,18 +208,28 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
Integer numThreads, numBlocks; Integer numThreads, numBlocks;
int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks); int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
GRID_ASSERT(ok); assert(ok);
Integer smemSize = numThreads * sizeof(sobj); Integer smemSize = numThreads * sizeof(sobj);
// Move out of UVM // Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream // Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise // as running this on the default stream fools the synchronise
deviceVector<sobj> buffer(numBlocks); #undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
sobj result; sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier(); accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
result = *buffer_v;
#endif
return result; return result;
} }
@@ -234,7 +244,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
const int words = sizeof(vobj)/sizeof(vector); const int words = sizeof(vobj)/sizeof(vector);
deviceVector<vector> buffer(osites); Vector<vector> buffer(osites);
vector *dat = (vector *)lat; vector *dat = (vector *)lat;
vector *buf = &buffer[0]; vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0]; iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];

View File

@@ -4,28 +4,29 @@ NAMESPACE_BEGIN(Grid);
// Possibly promote to double and sum // Possibly promote to double and sum
///////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj> template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD; typedef typename vobj::scalar_objectD sobjD;
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
sobj identity; zeroit(identity); sobj identity; zeroit(identity);
sobj ret; zeroit(ret); sobj ret ;
Integer nsimd= vobj::Nsimd(); Integer nsimd= vobj::Nsimd();
{
sycl::buffer<sobj, 1> abuff(&ret, {1}); theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
theGridAccelerator->submit([&](sycl::handler &cgh) { auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>()); cgh.parallel_for(cl::sycl::range<1>{osites},
cgh.parallel_for(sycl::range<1>{osites},
Reduction, Reduction,
[=] (sycl::id<1> item, auto &sum) { [=] (cl::sycl::id<1> item, auto &sum) {
auto osite = item[0]; auto osite = item[0];
sum +=Reduce(lat[osite]); sum +=Reduce(lat[osite]);
}); });
}); });
} theGridAccelerator->wait();
ret = mysum[0];
free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret); sobjD dret; convertType(dret,ret);
return dret; return dret;
} }
@@ -71,41 +72,55 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
template<class Word> Word svm_xor(Word *vec,uint64_t L) template<class Word> Word svm_xor(Word *vec,uint64_t L)
{ {
Word xorResult; xorResult = 0;
Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
Word identity; identity=0; Word identity; identity=0;
Word ret = 0; theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
{ auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
sycl::buffer<Word, 1> abuff(&ret, {1}); cgh.parallel_for(cl::sycl::range<1>{L},
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction, Reduction,
[=] (sycl::id<1> index, auto &sum) { [=] (cl::sycl::id<1> index, auto &sum) {
sum ^=vec[index]; sum ^=vec[index];
}); });
}); });
}
theGridAccelerator->wait();
return ret;
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
auto l = index % 61;
sum ^= vec[index]<<l | vec[index]>>(64-l);
});
});
}
theGridAccelerator->wait(); theGridAccelerator->wait();
Word ret = d_sum[0];
free(d_sum,*theGridAccelerator);
return ret; return ret;
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
/*
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/

View File

@@ -53,10 +53,10 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; int lowerdims = fine->_ndimension - coarse->_ndimension;
GRID_ASSERT(lowerdims >= 0); assert(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){ for(int d=0;d<lowerdims;d++){
GRID_ASSERT(fine->_simd_layout[d]==1); assert(fine->_simd_layout[d]==1);
GRID_ASSERT(fine->_processors[d]==1); assert(fine->_processors[d]==1);
} }
int multiplicity=1; int multiplicity=1;
@@ -66,9 +66,9 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
// local and global volumes subdivide cleanly after SIMDization // local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){ for(int d=0;d<rngdims;d++){
int fd= d+lowerdims; int fd= d+lowerdims;
GRID_ASSERT(coarse->_processors[d] == fine->_processors[fd]); assert(coarse->_processors[d] == fine->_processors[fd]);
GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[fd]); assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
GRID_ASSERT(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]); assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d]; multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
} }
@@ -83,18 +83,18 @@ inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
int rngdims = coarse->_ndimension; int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; GRID_ASSERT(lowerdims >= 0); int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors // assumes that the higher dimensions are not using more processors
// all further divisions are local // all further divisions are local
for(int d=0;d<lowerdims;d++) GRID_ASSERT(fine->_processors[d]==1); for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) GRID_ASSERT(coarse->_processors[d] == fine->_processors[d+lowerdims]); for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites // then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same // check that the total number of sims agree, meanse the iSites are the same
GRID_ASSERT(fine->Nsimd() == coarse->Nsimd()); assert(fine->Nsimd() == coarse->Nsimd());
// check that the two grids divide cleanly // check that the two grids divide cleanly
GRID_ASSERT( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() ); assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
return fine->lSites() / coarse->lSites(); return fine->lSites() / coarse->lSites();
} }
@@ -177,7 +177,7 @@ public:
skip = skip<<shift; skip = skip<<shift;
GRID_ASSERT((skip >> shift)==site); // check for overflow assert((skip >> shift)==site); // check for overflow
eng.discard(skip); eng.discard(skip);
#else #else
@@ -218,7 +218,7 @@ public:
GetState(saved,_generators[gen]); GetState(saved,_generators[gen]);
} }
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){ void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
GRID_ASSERT(saved.size()==RngStateCount); assert(saved.size()==RngStateCount);
std::stringstream ss; std::stringstream ss;
for(int i=0;i<RngStateCount;i++){ for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" "; ss<< saved[i]<<" ";
@@ -365,14 +365,9 @@ public:
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() ); _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
} }
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist)
{ template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
if ( l.Grid()->_isCheckerBoarded ) {
Lattice<vobj> tmp(_grid);
fill(tmp,dist);
pickCheckerboard(l.Checkerboard(),l,tmp);
return;
}
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -435,7 +430,7 @@ public:
//////////////////////////////////////////////// ////////////////////////////////////////////////
thread_for( lidx, _grid->lSites(), { thread_for( lidx, _grid->lSites(), {
int64_t gidx; int gidx;
int o_idx; int o_idx;
int i_idx; int i_idx;
int rank; int rank;

View File

@@ -21,18 +21,9 @@ NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP) #if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj> template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
inline void sliceSumReduction_cub_small(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
size_t subvol_size = e1*e2; size_t subvol_size = e1*e2;
deviceVector<vobj> reduction_buffer(rd*subvol_size); commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0]; auto rb_p = &reduction_buffer[0];
vobj zero_init; vobj zero_init;
zeroit(zero_init); zeroit(zero_init);
@@ -55,7 +46,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int))); d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device //copy offsets to device
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -88,7 +79,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream); acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy //sync after copy
accelerator_barrier(); accelerator_barrier();
@@ -103,15 +94,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
#if defined(GRID_SYCL) #if defined(GRID_SYCL)
template<class vobj> template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
inline void sliceSumReduction_sycl_small(const vobj *Data,
std::vector <vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
size_t subvol_size = e1*e2; size_t subvol_size = e1*e2;
@@ -122,7 +105,7 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
mysum[r] = vobj_zero; mysum[r] = vobj_zero;
} }
deviceVector<vobj> reduction_buffer(rd*subvol_size); commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0]; auto rb_p = &reduction_buffer[0];
@@ -141,11 +124,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
}); });
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](sycl::handler &cgh) { theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = sycl::reduction(&mysum[r],std::plus<>()); auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(sycl::range<1>{subvol_size}, cgh.parallel_for(cl::sycl::range<1>{subvol_size},
Reduction, Reduction,
[=](sycl::id<1> item, auto &sum) { [=](cl::sycl::id<1> item, auto &sum) {
auto s = item[0]; auto s = item[0];
sum += rb_p[r*subvol_size+s]; sum += rb_p[r*subvol_size+s];
}); });
@@ -161,23 +144,14 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
} }
#endif #endif
template<class vobj> template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
inline void sliceSumReduction_large(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
typedef typename vobj::vector_type vector; typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector); const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2; const int osites = rd*e1*e2;
deviceVector<vector>buffer(osites); commVector<vector>buffer(osites);
vector *dat = (vector *)Data; vector *dat = (vector *)Data;
vector *buf = &buffer[0]; vector *buf = &buffer[0];
std::vector<vector> lvSum_small(rd); Vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0]; vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) { for (int w = 0; w < words; w++) {
@@ -194,18 +168,13 @@ inline void sliceSumReduction_large(const vobj *Data,
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r]; lvSum_ptr[w+words*r]=lvSum_small[r];
} }
}
} }
template<class vobj>
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, }
std::vector<vobj> &lvSum,
const int rd, template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{ {
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) { if constexpr (sizeof(vobj) <= 256) {
@@ -223,15 +192,7 @@ inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
} }
template<class vobj> template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
@@ -247,19 +208,15 @@ inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
}); });
} }
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
{ {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else #else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif #endif
} }

View File

@@ -31,15 +31,15 @@ NAMESPACE_BEGIN(Grid);
inline void subdivides(GridBase *coarse,GridBase *fine) inline void subdivides(GridBase *coarse,GridBase *fine)
{ {
GRID_ASSERT(coarse->_ndimension == fine->_ndimension); assert(coarse->_ndimension == fine->_ndimension);
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
// local and global volumes subdivide cleanly after SIMDization // local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
GRID_ASSERT(coarse->_processors[d] == fine->_processors[d]); assert(coarse->_processors[d] == fine->_processors[d]);
GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[d]); assert(coarse->_simd_layout[d] == fine->_simd_layout[d]);
GRID_ASSERT((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]); assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
} }
@@ -276,40 +276,25 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite); autoView( ip_ , ip, AcceleratorWrite);
RealD t_IP=0;
RealD t_co=0;
RealD t_za=0;
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
t_IP-=usecond();
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine> blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
t_IP+=usecond();
t_co-=usecond();
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]); convertType(coarseData_[sc](v),ip_[sc]);
}); });
t_co+=usecond();
// improve numerical stability of projection // improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis> // |fine> = |fine> - <basis|fine> |basis>
ip=-ip; ip=-ip;
t_za-=usecond();
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
t_za+=usecond();
} }
// std::cout << GridLogPerformance << " blockProject : blockInnerProduct : "<<t_IP<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : conv : "<<t_co<<" us"<<std::endl;
// std::cout << GridLogPerformance << " blockProject : blockZaxpy : "<<t_za<<" us"<<std::endl;
} }
// This only minimises data motion from CPU to GPU
// there is chance of better implementation that does a vxk loop of inner products to data share
// at the GPU thread level
template<class vobj,class CComplex,int nbasis,class VLattice> template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData, inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData, const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis) const VLattice &Basis)
{ {
int NBatch = fineData.size(); int NBatch = fineData.size();
GRID_ASSERT(coarseData.size() == NBatch); assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid(); GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid(); GridBase * coarse= coarseData[0].Grid();
@@ -344,7 +329,7 @@ template<class vobj,class vobj2,class CComplex>
GridBase * coarse= coarseA.Grid(); GridBase * coarse= coarseA.Grid();
fineZ.Checkerboard()=fineX.Checkerboard(); fineZ.Checkerboard()=fineX.Checkerboard();
GRID_ASSERT(fineX.Checkerboard()==fineY.Checkerboard()); assert(fineX.Checkerboard()==fineY.Checkerboard());
subdivides(coarse,fine); // require they map subdivides(coarse,fine); // require they map
conformable(fineX,fineY); conformable(fineX,fineY);
conformable(fineX,fineZ); conformable(fineX,fineZ);
@@ -356,7 +341,7 @@ template<class vobj,class vobj2,class CComplex>
// FIXME merge with subdivide checking routine as this is redundant // FIXME merge with subdivide checking routine as this is redundant
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
GRID_ASSERT(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
autoView( fineZ_ , fineZ, AcceleratorWrite); autoView( fineZ_ , fineZ, AcceleratorWrite);
@@ -408,15 +393,8 @@ template<class vobj,class CComplex>
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion // Precision promotion
RealD t;
t=-usecond();
fine_inner = localInnerProductD<vobj>(fineX,fineY); fine_inner = localInnerProductD<vobj>(fineX,fineY);
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : localInnerProductD "<<t<<" us"<<std::endl;
t=-usecond();
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : blockSum "<<t<<" us"<<std::endl;
t=-usecond();
{ {
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite); autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead); autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
@@ -424,7 +402,6 @@ template<class vobj,class CComplex>
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
}); });
} }
// t+=usecond(); std::cout << GridLogPerformance << " blockInnerProduct : convertType "<<t<<" us"<<std::endl;
} }
@@ -467,9 +444,6 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
template<class vobj> template<class vobj>
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
{ {
const int maxsubsec=256;
typedef iVector<vobj,maxsubsec> vSubsec;
GridBase * fine = fineData.Grid(); GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid(); GridBase * coarse= coarseData.Grid();
@@ -495,34 +469,16 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
Coordinate fine_rdimensions = fine->_rdimensions; Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions; Coordinate coarse_rdimensions = coarse->_rdimensions;
vobj zz = Zero(); accelerator_for(sc,coarse->oSites(),1,{
// Somewhat lazy calculation
// Find the biggest power of two subsection divisor less than or equal to maxsubsec
int subsec=maxsubsec;
int subvol;
subvol=blockVol/subsec;
while(subvol*subsec!=blockVol){
subsec = subsec/2;
subvol=blockVol/subsec;
};
Lattice<vSubsec> coarseTmp(coarse);
autoView( coarseTmp_, coarseTmp, AcceleratorWriteDiscard);
auto coarseTmp_p= &coarseTmp_[0];
// Sum within subsecs in a first kernel
accelerator_for(sce,subsec*coarse->oSites(),vobj::Nsimd(),{
int sc=sce/subsec;
int e=sce%subsec;
// One thread per sub block // One thread per sub block
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
auto cd = coalescedRead(zz); vobj cd = Zero();
for(int sb=e*subvol;sb<MIN((e+1)*subvol,blockVol);sb++){
for(int sb=0;sb<blockVol;sb++){
int sf; int sf;
Coordinate coor_b(_ndimension); Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
@@ -530,21 +486,12 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d]; for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions); Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
cd=cd+coalescedRead(fineData_p[sf]); cd=cd+fineData_p[sf];
} }
coalescedWrite(coarseTmp_[sc](e),cd); coarseData_p[sc] = cd;
}); });
// Sum across subsecs in a second kernel
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{
auto cd = coalescedRead(coarseTmp_p[sc](0));
for(int e=1;e<subsec;e++){
cd=cd+coalescedRead(coarseTmp_p[sc](e));
}
coalescedWrite(coarseData_p[sc],cd);
});
return; return;
} }
@@ -601,7 +548,7 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
blockOrthonormalize(ip,Basis); blockOrthonormalize(ip,Basis);
} }
#ifdef GRID_ACCELERATED #if 0
// TODO: CPU optimized version here // TODO: CPU optimized version here
template<class vobj,class CComplex,int nbasis> template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
@@ -613,7 +560,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
// checks // checks
GRID_ASSERT( nbasis == Basis.size() ); assert( nbasis == Basis.size() );
subdivides(coarse,fine); subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
conformable(Basis[i].Grid(),fine); conformable(Basis[i].Grid(),fine);
@@ -627,37 +574,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
autoView( fineData_ , fineData, AcceleratorWrite); autoView( fineData_ , fineData, AcceleratorWrite);
autoView( coarseData_ , coarseData, AcceleratorRead); autoView( coarseData_ , coarseData, AcceleratorRead);
typedef LatticeView<vobj> Vview;
std::vector<Vview> AcceleratorVecViewContainer_h;
for(int v=0;v<nbasis;v++) {
AcceleratorVecViewContainer_h.push_back(Basis[v].View(AcceleratorRead));
}
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(nbasis);
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],nbasis *sizeof(Vview));
auto Basis_p = &AcceleratorVecViewContainer[0];
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
Coordinate frdimensions=fine->_rdimensions; accelerator_for(sf,fine->oSites(),1,{
Coordinate crdimensions=coarse->_rdimensions;
accelerator_for(sf,fine->oSites(),vobj::Nsimd(),{
int sc; int sc;
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,frdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,crdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
auto sum= coarseData_(sc)(0) *Basis_p[0](sf); for(int i=0;i<nbasis;i++) {
for(int i=1;i<nbasis;i++) sum = sum + coarseData_(sc)(i)*Basis_p[i](sf); /* auto basis_ = Basis[i], );*/
coalescedWrite(fineData_[sf],sum); if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
}); else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
for(int v=0;v<nbasis;v++) {
AcceleratorVecViewContainer_h[v].ViewClose();
} }
});
return; return;
} }
#else #else
// CPU version
template<class vobj,class CComplex,int nbasis,class VLattice> template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData, Lattice<vobj> &fineData,
@@ -687,7 +623,7 @@ inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>
const VLattice &Basis) const VLattice &Basis)
{ {
int NBatch = coarseData.size(); int NBatch = coarseData.size();
GRID_ASSERT(fineData.size() == NBatch); assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid(); GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid(); GridBase * coarse = coarseData[0].Grid();
@@ -715,12 +651,12 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
int ni = ig->_ndimension; int ni = ig->_ndimension;
int no = og->_ndimension; int no = og->_ndimension;
GRID_ASSERT(ni == no); assert(ni == no);
for(int d=0;d<no;d++){ for(int d=0;d<no;d++){
GRID_ASSERT(ig->_processors[d] == og->_processors[d]); assert(ig->_processors[d] == og->_processors[d]);
GRID_ASSERT(ig->_ldimensions[d] == og->_ldimensions[d]); assert(ig->_ldimensions[d] == og->_ldimensions[d]);
GRID_ASSERT(ig->lSites() == og->lSites()); assert(ig->lSites() == og->lSites());
} }
autoView(in_v,in,CpuRead); autoView(in_v,in,CpuRead);
@@ -744,102 +680,53 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid(); GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid(); GridBase *Tg = To.Grid();
GRID_ASSERT(!Fg->_isCheckerBoarded); assert(!Fg->_isCheckerBoarded);
GRID_ASSERT(!Tg->_isCheckerBoarded); assert(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd(); int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension; int nF = Fg->_ndimension;
int nT = Tg->_ndimension; int nT = Tg->_ndimension;
int nd = nF; int nd = nF;
GRID_ASSERT(nF == nT); assert(nF == nT);
for(int d=0;d<nd;d++){ for(int d=0;d<nd;d++){
GRID_ASSERT(Fg->_processors[d] == Tg->_processors[d]); assert(Fg->_processors[d] == Tg->_processors[d]);
} }
// the above should guarantee that the operations are local
/////////////////////////////////////////////////////////// #if 1
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
size_t nsite = 1; size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= RegionSize[i]; for(int i=0;i<nd;i++) nsite *= RegionSize[i];
typedef typename vobj::vector_type vector_type; size_t tbytes = 4*nsite*sizeof(int);
typedef typename vobj::scalar_type scalar_type; int *table = (int*)malloc(tbytes);
autoView(from_v,From,AcceleratorRead); thread_for(idx, nsite, {
autoView(to_v,To,AcceleratorWrite); Coordinate from_coor, to_coor;
size_t rem = idx;
accelerator_for(idx,nsite,1,{
Coordinate from_coor, to_coor, base;
Lexicographic::CoorFromIndex(base,idx,RegionSize);
for(int i=0;i<nd;i++){ for(int i=0;i<nd;i++){
from_coor[i] = base[i] + FromLowerLeft[i]; size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
to_coor[i] = base[i] + ToLowerLeft[i]; from_coor[i] = base_i + FromLowerLeft[i];
to_coor[i] = base_i + ToLowerLeft[i];
} }
int from_oidx = 0; for(int d=0;d<nd;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nd;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nd;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nd;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx]; int foidx = Fg->oIndex(from_coor);
vector_type* to = (vector_type *)&to_v[to_oidx]; int fiidx = Fg->iIndex(from_coor);
int toidx = Tg->oIndex(to_coor);
scalar_type stmp; int tiidx = Tg->iIndex(to_coor);
for(int w=0;w<words;w++){ int* tt = table + 4*idx;
stmp = getlane(from[w], from_lane); tt[0] = foidx;
putlane(to[w], stmp, to_lane); tt[1] = fiidx;
} tt[2] = toidx;
tt[3] = tiidx;
}); });
}
template<class vobj> int* table_d = (int*)acceleratorAllocDevice(tbytes);
void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int orthog) acceleratorCopyToDevice(table,table_d,tbytes);
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
GRID_ASSERT(!Fg->_isCheckerBoarded);
GRID_ASSERT(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
GRID_ASSERT(nF+1 == nT);
///////////////////////////////////////////////////////////
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
Coordinate RegionSize = Fg->_ldimensions;
size_t nsite = 1;
for(int i=0;i<nF;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
@@ -848,22 +735,12 @@ void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int
autoView(to_v,To,AcceleratorWrite); autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{ accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
Coordinate from_coor(nF), to_coor(nT); int* tt = table_d + 4*idx;
Lexicographic::CoorFromIndex(from_coor,idx,RegionSize); int from_oidx = *tt++;
int j=0; int from_lane = *tt++;
for(int i=0;i<nT;i++){ int to_oidx = *tt++;
if ( i!=orthog ) { int to_lane = *tt;
to_coor[i] = from_coor[j];
j++;
} else {
to_coor[i] = slice;
}
}
int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]);
int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]);
int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]);
int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]);
const vector_type* from = (const vector_type *)&from_v[from_oidx]; const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx]; vector_type* to = (vector_type *)&to_v[to_oidx];
@@ -874,77 +751,56 @@ void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int
putlane(to[w], stmp, to_lane); putlane(to[w], stmp, to_lane);
} }
}); });
acceleratorFreeDevice(table_d);
free(table);
#else
Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride;
Coordinate osf = Fg->_ostride;
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
autoView( t_v , To, CpuWrite);
autoView( f_v , From, CpuRead);
thread_for(idx,Fg->lSites(),{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
Lexicographic::CoorFromIndex(Fcoor,idx,ldf);
int in_region=1;
for(int d=0;d<nd;d++){
if ( (Fcoor[d] < FromLowerLeft[d]) || (Fcoor[d]>=FromLowerLeft[d]+RegionSize[d]) ){
in_region=0;
} }
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
template<class vobj>
void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, int orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int words=sizeof(vobj)/sizeof(vector_type);
//////////////////////////////////////////////////////////////////////////////////////////
// checks should guarantee that the operations are local
//////////////////////////////////////////////////////////////////////////////////////////
GridBase *Fg = From.Grid();
GridBase *Tg = To.Grid();
GRID_ASSERT(!Fg->_isCheckerBoarded);
GRID_ASSERT(!Tg->_isCheckerBoarded);
int Nsimd = Fg->Nsimd();
int nF = Fg->_ndimension;
int nT = Tg->_ndimension;
GRID_ASSERT(nT+1 == nF);
///////////////////////////////////////////////////////////
// do the index calc on the GPU
///////////////////////////////////////////////////////////
Coordinate f_ostride = Fg->_ostride;
Coordinate f_istride = Fg->_istride;
Coordinate f_rdimensions = Fg->_rdimensions;
Coordinate t_ostride = Tg->_ostride;
Coordinate t_istride = Tg->_istride;
Coordinate t_rdimensions = Tg->_rdimensions;
Coordinate RegionSize = Tg->_ldimensions;
size_t nsite = 1;
for(int i=0;i<nT;i++) nsite *= RegionSize[i]; // whole volume of lower dim grid
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
Coordinate from_coor(nF), to_coor(nT);
Lexicographic::CoorFromIndex(to_coor,idx,RegionSize);
int j=0;
for(int i=0;i<nF;i++){
if ( i!=orthog ) {
from_coor[i] = to_coor[j];
j++;
} else {
from_coor[i] = slice;
} }
} if (in_region) {
int from_oidx = 0; for(int d=0;d<nF;d++) from_oidx+=f_ostride[d]*(from_coor[d]%f_rdimensions[d]); #if 0
int from_lane = 0; for(int d=0;d<nF;d++) from_lane+=f_istride[d]*(from_coor[d]/f_rdimensions[d]); Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
int to_oidx = 0; for(int d=0;d<nT;d++) to_oidx+=t_ostride[d]*(to_coor[d]%t_rdimensions[d]); Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
int to_lane = 0; for(int d=0;d<nT;d++) to_lane+=t_istride[d]*(to_coor[d]/t_rdimensions[d]); Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
const vector_type* from = (const vector_type *)&from_v[from_oidx]; scalar_type * fp = (scalar_type *)&f_v[odx_f];
vector_type* to = (vector_type *)&to_v[to_oidx]; scalar_type * tp = (scalar_type *)&t_v[odx_t];
scalar_type stmp;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane); tp[w].putlane(fp[w].getlane(idx_f),idx_t);
putlane(to[w], stmp, to_lane); }
#else
peekLocalSite(s,f_v,Fcoor);
pokeLocalSite(s,t_v,Tcoor);
#endif
} }
}); });
#endif
} }
template<class vobj> template<class vobj>
void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog) void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
{ {
@@ -955,16 +811,16 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
GRID_ASSERT(nl+1 == nh); assert(nl+1 == nh);
GRID_ASSERT(orthog<nh); assert(orthog<nh);
GRID_ASSERT(orthog>=0); assert(orthog>=0);
GRID_ASSERT(hg->_processors[orthog]==1); assert(hg->_processors[orthog]==1);
int dl; dl = 0; int dl; dl = 0;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d != orthog) { if ( d != orthog) {
GRID_ASSERT(lg->_processors[dl] == hg->_processors[d]); assert(lg->_processors[dl] == hg->_processors[d]);
GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]); assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++; dl++;
} }
} }
@@ -981,14 +837,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[orthog] = slice; hcoor[orthog] = slice;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
hcoor[d]=lcoor[ddl]; hcoor[d]=lcoor[ddl++];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
} }
ddl++;
}
} }
peekLocalSite(s,lowDimv,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor); pokeLocalSite(s,higherDimv,hcoor);
@@ -1005,17 +855,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
GRID_ASSERT(nl+1 == nh); assert(nl+1 == nh);
GRID_ASSERT(orthog<nh); assert(orthog<nh);
GRID_ASSERT(orthog>=0); assert(orthog>=0);
GRID_ASSERT(hg->_processors[orthog]==1); assert(hg->_processors[orthog]==1);
lowDim.Checkerboard() = higherDim.Checkerboard();
int dl; dl = 0; int dl; dl = 0;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d != orthog) { if ( d != orthog) {
GRID_ASSERT(lg->_processors[dl] == hg->_processors[d]); assert(lg->_processors[dl] == hg->_processors[d]);
GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]); assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
dl++; dl++;
} }
} }
@@ -1027,16 +876,11 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
Coordinate lcoor(nl); Coordinate lcoor(nl);
Coordinate hcoor(nh); Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor); lg->LocalIndexToLocalCoor(idx,lcoor);
hcoor[orthog] = slice;
int ddl=0; int ddl=0;
hcoor[orthog] = slice;
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
hcoor[d]=lcoor[ddl]; hcoor[d]=lcoor[ddl++];
if ( hg->_checker_dim == d ) {
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
}
ddl++;
} }
} }
peekLocalSite(s,higherDimv,hcoor); peekLocalSite(s,higherDimv,hcoor);
@@ -1045,7 +889,9 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
//Can I implement with local copyregion??
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
template<class vobj> template<class vobj>
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{ {
@@ -1056,28 +902,131 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
int nl = lg->_ndimension; int nl = lg->_ndimension;
int nh = hg->_ndimension; int nh = hg->_ndimension;
GRID_ASSERT(nl == nh); assert(nl == nh);
GRID_ASSERT(orthog<nh); assert(orthog<nh);
GRID_ASSERT(orthog>=0); assert(orthog>=0);
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
GRID_ASSERT(lg->_processors[d] == hg->_processors[d]); assert(lg->_processors[d] == hg->_processors[d]);
GRID_ASSERT(lg->_ldimensions[d] == hg->_ldimensions[d]); assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
} }
} }
Coordinate sz = lg->_ldimensions;
sz[orthog]=1; #if 1
Coordinate f_ll(nl,0); f_ll[orthog]=slice_lo; size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
Coordinate t_ll(nh,0); t_ll[orthog]=slice_hi; size_t tbytes = 4*nsite*sizeof(int);
localCopyRegion(lowDim,higherDim,f_ll,t_ll,sz); int *table = (int*)malloc(tbytes);
thread_for(idx,nsite,{
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lcoor[orthog] = slice_lo;
hcoor[orthog] = slice_hi;
size_t rem = idx;
for(int mu=0;mu<nl;mu++){
if(mu != orthog){
int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
lcoor[mu] = hcoor[mu] = xmu;
}
}
int loidx = lg->oIndex(lcoor);
int liidx = lg->iIndex(lcoor);
int hoidx = hg->oIndex(hcoor);
int hiidx = hg->iIndex(hcoor);
int* tt = table + 4*idx;
tt[0] = loidx;
tt[1] = liidx;
tt[2] = hoidx;
tt[3] = hiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(lowDim_v,lowDim,AcceleratorRead);
autoView(higherDim_v,higherDim,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
vector_type* to = (vector_type *)&higherDim_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDimv,hcoor);
}
});
#endif
} }
template<class vobj> template<class vobj>
void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{ {
InsertSliceLocal(higherDim,lowDim,slice_hi,slice_lo,orthog); typedef typename vobj::scalar_object sobj;
GridBase *lg = lowDim.Grid();
GridBase *hg = higherDim.Grid();
int nl = lg->_ndimension;
int nh = hg->_ndimension;
assert(nl == nh);
assert(orthog<nh);
assert(orthog>=0);
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{
sobj s;
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lg->LocalIndexToLocalCoor(idx,lcoor);
if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor;
hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDimv,lcoor);
}
});
} }
@@ -1093,7 +1042,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
subdivides(cg,fg); subdivides(cg,fg);
GRID_ASSERT(cg->_ndimension==fg->_ndimension); assert(cg->_ndimension==fg->_ndimension);
Coordinate ratio(cg->_ndimension); Coordinate ratio(cg->_ndimension);
@@ -1103,7 +1052,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
Coordinate fcoor(nd); Coordinate fcoor(nd);
Coordinate ccoor(nd); Coordinate ccoor(nd);
for(int64_t g=0;g<fg->gSites();g++){ for(int g=0;g<fg->gSites();g++){
fg->GlobalIndexToGlobalCoor(g,fcoor); fg->GlobalIndexToGlobalCoor(g,fcoor);
for(int d=0;d<nd;d++){ for(int d=0;d<nd;d++){
@@ -1157,7 +1106,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
int lex; int lex;
Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions); Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
GRID_ASSERT(lex < out.size()); assert(lex < out.size());
out_ptrs[lane] = &out[lex]; out_ptrs[lane] = &out[lex];
} }
@@ -1221,7 +1170,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
typedef typename vobj::vector_type vtype; typedef typename vobj::vector_type vtype;
GridBase* grid = out.Grid(); GridBase* grid = out.Grid();
GRID_ASSERT(in.size()==grid->lSites()); assert(in.size()==grid->lSites());
const int ndim = grid->Nd(); const int ndim = grid->Nd();
constexpr int nsimd = vtype::Nsimd(); constexpr int nsimd = vtype::Nsimd();
@@ -1268,7 +1217,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
typedef typename vobj::vector_type vtype; typedef typename vobj::vector_type vtype;
GridBase* grid = out._grid; GridBase* grid = out._grid;
GRID_ASSERT(in.size()==grid->lSites()); assert(in.size()==grid->lSites());
int ndim = grid->Nd(); int ndim = grid->Nd();
int nsimd = vtype::Nsimd(); int nsimd = vtype::Nsimd();
@@ -1329,9 +1278,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
template<class VobjOut, class VobjIn> template<class VobjOut, class VobjIn>
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in) void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{ {
GRID_ASSERT(out.Grid()->Nd() == in.Grid()->Nd()); assert(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){ for(int d=0;d<out.Grid()->Nd();d++){
GRID_ASSERT(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]); assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
} }
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
GridBase *in_grid=in.Grid(); GridBase *in_grid=in.Grid();
@@ -1382,9 +1331,9 @@ class precisionChangeWorkspace{
public: public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){ precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
GRID_ASSERT(out_grid->Nd() == in_grid->Nd()); assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){ for(int d=0;d<out_grid->Nd();d++){
GRID_ASSERT(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]); assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
} }
int Nsimd_out = out_grid->Nsimd(); int Nsimd_out = out_grid->Nsimd();
@@ -1549,7 +1498,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int full_vecs = full.size(); int full_vecs = full.size();
GRID_ASSERT(full_vecs>=1); assert(full_vecs>=1);
GridBase * full_grid = full[0].Grid(); GridBase * full_grid = full[0].Grid();
GridBase *split_grid = split.Grid(); GridBase *split_grid = split.Grid();
@@ -1567,18 +1516,18 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
////////////////////////////// //////////////////////////////
// Checks // Checks
////////////////////////////// //////////////////////////////
GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension); assert(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){ for(int n=0;n<full_vecs;n++){
GRID_ASSERT(full[n].Checkerboard() == cb); assert(full[n].Checkerboard() == cb);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]); assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]); assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
} }
} }
int nvector =full_nproc/split_nproc; int nvector =full_nproc/split_nproc;
GRID_ASSERT(nvector*split_nproc==full_nproc); assert(nvector*split_nproc==full_nproc);
GRID_ASSERT(nvector == full_vecs); assert(nvector == full_vecs);
Coordinate ratio(ndim); Coordinate ratio(ndim);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
@@ -1622,7 +1571,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int fvol = lsites; int fvol = lsites;
int chunk = (nvec*fvol)/sP; GRID_ASSERT(chunk*sP == nvec*fvol); int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
// Loop over reordered data post A2A // Loop over reordered data post A2A
thread_for(c, chunk, { thread_for(c, chunk, {
@@ -1675,7 +1624,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
int full_vecs = full.size(); int full_vecs = full.size();
GRID_ASSERT(full_vecs>=1); assert(full_vecs>=1);
GridBase * full_grid = full[0].Grid(); GridBase * full_grid = full[0].Grid();
GridBase *split_grid = split.Grid(); GridBase *split_grid = split.Grid();
@@ -1693,18 +1642,18 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
////////////////////////////// //////////////////////////////
// Checks // Checks
////////////////////////////// //////////////////////////////
GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension); assert(full_grid->_ndimension==split_grid->_ndimension);
for(int n=0;n<full_vecs;n++){ for(int n=0;n<full_vecs;n++){
GRID_ASSERT(full[n].Checkerboard() == cb); assert(full[n].Checkerboard() == cb);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]); assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]); assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
} }
} }
int nvector =full_nproc/split_nproc; int nvector =full_nproc/split_nproc;
GRID_ASSERT(nvector*split_nproc==full_nproc); assert(nvector*split_nproc==full_nproc);
GRID_ASSERT(nvector == full_vecs); assert(nvector == full_vecs);
Coordinate ratio(ndim); Coordinate ratio(ndim);
for(int d=0;d<ndim;d++){ for(int d=0;d<ndim;d++){
@@ -1740,7 +1689,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
auto lsites= rsites/M; // Decreases rsites by M auto lsites= rsites/M; // Decreases rsites by M
int fvol = lsites; int fvol = lsites;
int chunk = (nvec*fvol)/sP; GRID_ASSERT(chunk*sP == nvec*fvol); int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
{ {
// Loop over reordered data post A2A // Loop over reordered data post A2A
@@ -1789,35 +1738,5 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
} }
} }
//////////////////////////////////////////////////////
// Faster but less accurate blockProject
//////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProjectFast(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const VLattice &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
Lattice<iScalar<CComplex> > ip(coarse);
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
RealD t_IP=0;
RealD t_co=0;
for(int v=0;v<nbasis;v++) {
t_IP-=usecond();
blockInnerProductD(ip,Basis[v],fineData);
t_IP+=usecond();
t_co-=usecond();
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
t_co+=usecond();
}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -106,47 +106,6 @@ public:
} }
}; };
#ifdef GRID_LOG_VIEWS
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
const char* filename; int line, mode;
public:
ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
v(_v), filename(_filename), line(_line), mode(_mode) {
switch (mode){
case AcceleratorRead:
case AcceleratorWrite:
case CpuRead:
case CpuWrite:
ViewLogger::LogOpen(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
};
~ViewCloser() {
switch (mode) {
case AcceleratorWriteDiscard:
case AcceleratorWrite:
case CpuWrite:
ViewLogger::LogClose(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
v.ViewClose();
}
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
#else
// Little autoscope assister // Little autoscope assister
template<class View> template<class View>
class ViewCloser class ViewCloser
@@ -160,7 +119,6 @@ class ViewCloser
#define autoView(l_v,l,mode) \ #define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \ auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v); ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
#endif
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST // Lattice expression types used by ET to assemble the AST

View File

@@ -45,188 +45,6 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); } typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
}; };
/*
*
* TODO:
* -- address elementsof vobj via thread block in Scatter/Gather
* -- overlap comms with motion in Face_exchange
*
*/
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int rNsimda= Nsimd/simd[dim]; // should be equal
GRID_ASSERT(rNsimda==rNsimd);
int face_ovol=block*nblock;
// GRID_ASSERT(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
//doesn't hide the SIMD direction and keeps explicit in the threadIdx
//for cross platform
// FIXME -- can put internal indices into thread loop
auto buf_p = & buf[0];
autoView(lat_v, lat, AcceleratorWrite);
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
#ifdef GRID_SIMT
{
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int blane=0;blane<Nsimd;blane++) {
#endif
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
///////////////////////////////////////////////////////////////
// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
///////////////////////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Transfer into lattice - will coalesce
///////////////////////////////////////////
// sobj obj = extractLane(blane,buf_p[ss+offset]);
// insertLane(lane,lat_v[osite],obj);
const int words=sizeof(vobj)/sizeof(vector_type);
vector_type * from = (vector_type *)&buf_p[ss+offset];
vector_type * to = (vector_type *)&lat_v[osite];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], blane);
putlane(to[w], stmp, lane);
}
}
});
}
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
const Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
autoView(lat_v, lat, AcceleratorRead);
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int face_ovol=block*nblock;
// GRID_ASSERT(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
//doesn't hide the SIMD direction and keeps explicit in the threadIdx
//for cross platform
//For CPU perhaps just run a loop over Nsimd
auto buf_p = & buf[0];
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
#ifdef GRID_SIMT
{
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int blane=0;blane<Nsimd;blane++) {
#endif
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
////////////////////////////////////////////
// osite
////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Take out of lattice
///////////////////////////////////////////
// sobj obj = extractLane(lane,lat_v[osite]);
// insertLane(blane,buf_p[ss+offset],obj);
const int words=sizeof(vobj)/sizeof(vector_type);
vector_type * to = (vector_type *)&buf_p[ss+offset];
vector_type * from = (vector_type *)&lat_v[osite];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], lane);
putlane(to[w], stmp, blane);
}
}
});
}
class PaddedCell { class PaddedCell {
public: public:
GridCartesian * unpadded_grid; GridCartesian * unpadded_grid;
@@ -245,19 +63,15 @@ public:
dims=_grid->Nd(); dims=_grid->Nd();
AllocateGrids(); AllocateGrids();
Coordinate local =unpadded_grid->LocalDimensions(); Coordinate local =unpadded_grid->LocalDimensions();
Coordinate procs =unpadded_grid->ProcessorGrid();
for(int d=0;d<dims;d++){ for(int d=0;d<dims;d++){
if ( procs[d] > 1 ) GRID_ASSERT(local[d]>=depth); assert(local[d]>=depth);
} }
} }
void DeleteGrids(void) void DeleteGrids(void)
{ {
Coordinate processors=unpadded_grid->_processors;
for(int d=0;d<grids.size();d++){ for(int d=0;d<grids.size();d++){
if ( processors[d] > 1 ) {
delete grids[d]; delete grids[d];
} }
}
grids.resize(0); grids.resize(0);
}; };
void AllocateGrids(void) void AllocateGrids(void)
@@ -267,36 +81,27 @@ public:
Coordinate processors=unpadded_grid->_processors; Coordinate processors=unpadded_grid->_processors;
Coordinate plocal =unpadded_grid->LocalDimensions(); Coordinate plocal =unpadded_grid->LocalDimensions();
Coordinate global(dims); Coordinate global(dims);
GridCartesian *old_grid = unpadded_grid;
// expand up one dim at a time // expand up one dim at a time
for(int d=0;d<dims;d++){ for(int d=0;d<dims;d++){
if ( processors[d] > 1 ) {
plocal[d] += 2*depth; plocal[d] += 2*depth;
for(int d=0;d<dims;d++){ for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d]; global[d] = plocal[d]*processors[d];
} }
old_grid = new GridCartesian(global,simd,processors); grids.push_back(new GridCartesian(global,simd,processors));
}
grids.push_back(old_grid);
} }
}; };
template<class vobj> template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{ {
Coordinate processors=unpadded_grid->_processors;
Lattice<vobj> out(unpadded_grid); Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions(); Coordinate local =unpadded_grid->LocalDimensions();
// depends on the MPI spread Coordinate fll(dims,depth); // depends on the MPI spread
Coordinate fll(dims,depth);
Coordinate tll(dims,0); // depends on the MPI spread Coordinate tll(dims,0); // depends on the MPI spread
for(int d=0;d<dims;d++){
if( processors[d]==1 ) fll[d]=0;
}
localCopyRegion(in,out,fll,tll,local); localCopyRegion(in,out,fll,tll,local);
return out; return out;
} }
@@ -311,22 +116,10 @@ public:
} }
return tmp; return tmp;
} }
template<class vobj>
inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = ExpandPeriodic(d,tmp); // rvalue && assignment
}
return tmp;
}
// expand up one dim at a time // expand up one dim at a time
template<class vobj> template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{ {
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid(); GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid); Lattice<vobj> padded(new_grid);
@@ -336,30 +129,10 @@ public:
if(dim==0) conformable(old_grid,unpadded_grid); if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]); else conformable(old_grid,grids[dim-1]);
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0; double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
// replace with a copy and maybe grid swizzle
// return in;??
double t = usecond();
padded = in;
tins += usecond() - t;
} else {
//////////////////////////////////////////////
// Replace sequence with
// ---------------------
// (i) Gather high face(s); start comms
// (ii) Gather low face(s); start comms
// (iii) Copy middle bit with localCopyRegion
// (iv) Complete high face(s), insert slice(s)
// (iv) Complete low face(s), insert slice(s)
//////////////////////////////////////////////
// Middle bit // Middle bit
double t = usecond(); double t = usecond();
for(int x=0;x<local[dim];x++){ for(int x=0;x<local[dim];x++){
@@ -389,213 +162,13 @@ public:
} }
tins += usecond() - t; tins += usecond() - t;
}
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl; std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded; return padded;
} }
template<class vobj>
inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
// Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
// std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
padded=in; // slightly different interface could avoid a copy operation
} else {
Face_exchange(in,padded,dim,depth);
return padded;
}
return padded;
}
template<class vobj>
void Face_exchange(const Lattice<vobj> &from,
Lattice<vobj> &to,
int dimension,int depth) const
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::scalar_object sobj;
RealD t_gather=0.0;
RealD t_scatter=0.0;
RealD t_comms=0.0;
RealD t_copy=0.0;
// std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
// DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
GridBase *grid=from.Grid();
GridBase *new_grid=to.Grid();
Coordinate lds = from.Grid()->_ldimensions;
Coordinate nlds= to.Grid()->_ldimensions;
Coordinate simd= from.Grid()->_simd_layout;
int ld = lds[dimension];
int nld = to.Grid()->_ldimensions[dimension];
const int Nsimd = vobj::Nsimd();
GRID_ASSERT(depth<=lds[dimension]); // A must be on neighbouring node
GRID_ASSERT(depth>0); // A caller bug if zero
GRID_ASSERT(ld+2*depth==nld);
////////////////////////////////////////////////////////////////////////////
// Face size and byte calculations
////////////////////////////////////////////////////////////////////////////
int buffer_size = 1;
for(int d=0;d<lds.size();d++){
if ( d!= dimension) buffer_size=buffer_size*lds[d];
}
buffer_size = buffer_size / Nsimd;
int rNsimd = Nsimd / simd[dimension];
GRID_ASSERT( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static deviceVector<vobj> send_buf;
static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req;
int words = buffer_size;
int bytes = words * sizeof(vobj);
////////////////////////////////////////////////////////////////////////////
// Communication coords
////////////////////////////////////////////////////////////////////////////
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
////////////////////////////////////////////////////////////////////////////
// Gather all surface terms up to depth "d"
////////////////////////////////////////////////////////////////////////////
RealD t;
RealD t_tot=-usecond();
int plane=0;
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+0;
t=usecond();
GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
t_gather+=usecond()-t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+1;
t=usecond();
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
t_gather+= usecond() - t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
t_comms+=usecond()-t;
}
////////////////////////////////////////////////////////////////////////////
// Copy interior -- overlap this with comms
////////////////////////////////////////////////////////////////////////////
int Nd = new_grid->Nd();
Coordinate LL(Nd,0);
Coordinate sz = grid->_ldimensions;
Coordinate toLL(Nd,0);
toLL[dimension]=depth;
t=usecond();
localCopyRegion(from,to,LL,toLL,sz);
t_copy= usecond() - t;
////////////////////////////////////////////////////////////////////////////
// Scatter all faces
////////////////////////////////////////////////////////////////////////////
plane=0;
t=usecond();
grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
}
t_scatter= usecond() - t;
t=usecond();
grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
}
t_scatter+= usecond() - t;
t_tot+=usecond();
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: total :" << t_tot/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << depth*4.0*bytes/t_gather << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << depth*4.0*bytes/t_scatter<< "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: face bytes :" << depth*bytes/1e6 << "MB"<<std::endl;
}
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -69,7 +69,6 @@ GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL"); GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE"); GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogComms (1, "Comms", GridLogColours, "BLUE");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE"); GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE"); GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE"); GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
@@ -85,7 +84,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogDebug.Active(0); GridLogDebug.Active(0);
GridLogPerformance.Active(0); GridLogPerformance.Active(0);
GridLogDslash.Active(0); GridLogDslash.Active(0);
GridLogComms.Active(0);
GridLogIntegrator.Active(1); GridLogIntegrator.Active(1);
GridLogColours.Active(0); GridLogColours.Active(0);
GridLogHMC.Active(1); GridLogHMC.Active(1);
@@ -99,7 +97,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0); if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);

View File

@@ -33,6 +33,10 @@
#ifndef GRID_LOG_H #ifndef GRID_LOG_H
#define GRID_LOG_H #define GRID_LOG_H
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -176,7 +180,6 @@ extern GridLogger GridLogError;
extern GridLogger GridLogWarning; extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage; extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug; extern GridLogger GridLogDebug;
extern GridLogger GridLogComms;
extern GridLogger GridLogPerformance; extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash; extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative; extern GridLogger GridLogIterative;
@@ -223,6 +226,8 @@ inline void Grid_pass(Args&&... args) {
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl; std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
} }
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() { \ #define BACKTRACEFILE() { \
char string[20]; \ char string[20]; \

Some files were not shown because too many files have changed in this diff Show More