1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-13 01:05:36 +00:00

merge upstream develop

This commit is contained in:
nmeyer-ur 2020-07-07 20:26:47 +02:00
commit 8726e94ea7
326 changed files with 10335 additions and 9381 deletions

View File

@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h> #include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h> #include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/threads/Threads.h> #include <Grid/threads/ThreadReduction.h>
#include <Grid/serialisation/Serialisation.h> #include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h> #include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h> #include <Grid/communicator/Communicator.h>

View File

@ -6,6 +6,7 @@
/////////////////// ///////////////////
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <memory>
#include <vector> #include <vector>
#include <array> #include <array>
#include <string> #include <string>

View File

@ -18,12 +18,23 @@
#pragma push_macro("__CUDA_ARCH__") #pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__") #pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__") #pragma push_macro("__CUDACC__")
#undef __CUDA_ARCH__
#undef __NVCC__ #undef __NVCC__
#undef __CUDACC__ #undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__ #define __NVCC__REDEFINE__
#endif
/* SYCL save and restore compile environment*/
#ifdef GRID_SYCL
#pragma push
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif #endif
#include <Grid/Eigen/Dense> #include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor> #include <Grid/Eigen/unsupported/CXX11/Tensor>
@ -31,7 +42,13 @@
#ifdef __NVCC__REDEFINE__ #ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__") #pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__") #pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__") #pragma pop_macro("GRID_SIMT")
#pragma pop
#endif
/*SYCL restore*/
#ifdef __SYCL__REDEFINE__
#pragma pop_macro("__SYCL_DEVICE_ONLY__")
#pragma pop #pragma pop
#endif #endif
@ -39,3 +56,4 @@
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif

View File

@ -21,7 +21,7 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h extra_headers+=serialisation/Hdf5Type.h
endif endif
all: version-cache all: version-cache Version.h
version-cache: version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\ @if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
@ -42,7 +42,7 @@ version-cache:
fi;\ fi;\
rm -f vertmp rm -f vertmp
Version.h: Version.h: version-cache
cp version-cache Version.h cp version-cache Version.h
.PHONY: version-cache .PHONY: version-cache

View File

@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H #ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h> #include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h> #include <Grid/algorithms/Preconditioner.h>
NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/Zolotarev.h> #include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h> #include <Grid/algorithms/approx/Chebyshev.h>
@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Forecast.h> #include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/approx/RemezGeneral.h> #include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h> #include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/iterative/Deflation.h> #include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h> #include <Grid/algorithms/iterative/BiCGSTAB.h>
NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateResidual.h> #include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h> #include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h> #include <Grid/algorithms/iterative/SchurRedBlack.h>
@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h> #include <Grid/algorithms/iterative/PowerMethod.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/CoarsenedMatrix.h> #include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>
#endif #endif

View File

@ -1,14 +1,3 @@
// blockZaxpy in bockPromote - 3s, 5%
// noncoalesced linalg in Preconditionoer ~ 3s 5%
// Lancos tuning or replace 10-20s ~ 25%, open ended
// setup tuning 5s ~ 8%
// -- e.g. ordermin, orderstep tunables.
// MdagM path without norm in LinOp code. few seconds
// Mdir calc blocking kernels
// Fuse kernels in blockMaskedInnerProduct
// preallocate Vectors in Cayley 5D ~ few percent few seconds
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -91,34 +80,7 @@ public:
} }
directions [2*_d]=0; directions [2*_d]=0;
displacements[2*_d]=0; displacements[2*_d]=0;
//// report back
std::cout<<GridLogMessage<<"directions :";
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
std::cout<<std::endl;
std::cout<<GridLogMessage<<"displacements :";
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
std::cout<<std::endl;
} }
/*
// Original cleaner code
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
for(int d=0;d<dimension;d++){
directions[2*d ] = d;
directions[2*d+1] = d;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*dimension]=0;
displacements[2*dimension]=0;
}
std::vector<int> GetDelta(int point) {
std::vector<int> delta(dimension,0);
delta[directions[point]] = displacements[point];
return delta;
};
*/
}; };
@ -149,25 +111,7 @@ public:
CoarseScalar InnerProd(CoarseGrid); CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl; std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace); blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
// blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal();
} }
void CheckOrthogonal(void){
CoarseVector iProj(CoarseGrid);
CoarseVector eProj(CoarseGrid);
for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace);
eProj=Zero();
accelerator_for(ss, CoarseGrid->oSites(),1,{
eProj[ss](i)=CComplex(1.0);
});
eProj=eProj - iProj;
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
}
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){ void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace); blockProject(CoarseVec,FineVec,subspace);
} }
@ -175,11 +119,6 @@ public:
FineVec.Checkerboard() = subspace[0].Checkerboard(); FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace); blockPromote(CoarseVec,FineVec,subspace);
} }
void CreateSubspaceRandom(GridParallelRNG &RNG){
for(int i=0;i<nbasis;i++){
random(RNG,subspace[i]);
}
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) { virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
@ -190,12 +129,12 @@ public:
FineField Mn(FineGrid); FineField Mn(FineGrid);
for(int b=0;b<nn;b++){ for(int b=0;b<nn;b++){
subspace[b] = Zero(); subspace[b] = Zero();
gaussian(RNG,noise); gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5); scale = std::pow(norm2(noise),-0.5);
noise=noise*scale; noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl; hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){ for(int i=0;i<1;i++){
@ -218,7 +157,7 @@ public:
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found // and this is the best I found
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
#if 1
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop, virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn, int nn,
double hi, double hi,
@ -280,10 +219,10 @@ public:
hermop.HermOp(*Tn,y); hermop.HermOp(*Tn,y);
auto y_v = y.View(); autoView( y_v , y, AcceleratorWrite);
auto Tn_v = Tn->View(); autoView( Tn_v , (*Tn), AcceleratorWrite);
auto Tnp_v = Tnp->View(); autoView( Tnp_v , (*Tnp), AcceleratorWrite);
auto Tnm_v = Tnm->View(); autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
@ -313,201 +252,6 @@ public:
} }
assert(b==nn); assert(b==nn);
} }
#endif
#if 0
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
FineField combined(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
#define FILTERb(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
Cheb(hermop,noise,Mn); \
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
subspace[b] = Mn; \
hermop.Op(Mn,tmp); \
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
RealD alpha=-0.8;
RealD beta =-0.8;
#define FILTER(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
Cheb(hermop,noise,Mn); \
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
subspace[b] = Mn; \
hermop.Op(Mn,tmp); \
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
#define FILTERc(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
Cheb(hermop,noise,combined); \
}
double node = 0.000;
FILTERb(lo,hi,orderfilter);// 0
// FILTERc(node,hi,51);// 0
noise = Mn;
int base = 0;
int mult = 100;
FILTER(node,hi,base+1*mult);
FILTER(node,hi,base+2*mult);
FILTER(node,hi,base+3*mult);
FILTER(node,hi,base+4*mult);
FILTER(node,hi,base+5*mult);
FILTER(node,hi,base+6*mult);
FILTER(node,hi,base+7*mult);
FILTER(node,hi,base+8*mult);
FILTER(node,hi,base+9*mult);
FILTER(node,hi,base+10*mult);
FILTER(node,hi,base+11*mult);
FILTER(node,hi,base+12*mult);
FILTER(node,hi,base+13*mult);
FILTER(node,hi,base+14*mult);
FILTER(node,hi,base+15*mult);
assert(b==nn);
}
#endif
#if 0
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
FineField combined(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
JacobiPoly(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
// subspace[b] = tmp; b++;
// }
}
#define FILTER(lambda) \
{ \
hermop.HermOp(subspace[0],tmp); \
tmp = tmp - lambda *subspace[0]; \
scale = std::pow(norm2(tmp),-0.5); \
tmp=tmp*scale; \
subspace[b] = tmp; \
hermop.Op(subspace[b],tmp); \
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
// subspace[b] = tmp; b++;
// }
FILTER(2.0e-5);
FILTER(2.0e-4);
FILTER(4.0e-4);
FILTER(8.0e-4);
FILTER(8.0e-4);
FILTER(2.0e-3);
FILTER(3.0e-3);
FILTER(4.0e-3);
FILTER(5.0e-3);
FILTER(6.0e-3);
FILTER(2.5e-3);
FILTER(3.5e-3);
FILTER(4.5e-3);
FILTER(5.5e-3);
FILTER(6.5e-3);
// FILTER(6.0e-5);//6
// FILTER(7.0e-5);//8
// FILTER(8.0e-5);//9
// FILTER(9.0e-5);//3
/*
// FILTER(1.0e-4);//10
FILTER(2.0e-4);//11
// FILTER(3.0e-4);//12
// FILTER(4.0e-4);//13
FILTER(5.0e-4);//14
FILTER(6.0e-3);//4
FILTER(7.0e-4);//1
FILTER(8.0e-4);//7
FILTER(9.0e-4);//15
FILTER(1.0e-3);//2
FILTER(2.0e-3);//2
FILTER(3.0e-3);//2
FILTER(4.0e-3);//2
FILTER(5.0e-3);//2
FILTER(6.0e-3);//2
FILTER(7.0e-3);//2
FILTER(8.0e-3);//2
FILTER(1.0e-2);//2
*/
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
assert(b==nn);
}
#endif
}; };
@ -535,42 +279,34 @@ public:
CartesianStencil<siteVector,siteVector,int> Stencil; CartesianStencil<siteVector,siteVector,int> Stencil;
std::vector<CoarseMatrix> A; std::vector<CoarseMatrix> A;
/////////////////////// ///////////////////////
// Interface // Interface
/////////////////////// ///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
RealD M (const CoarseVector &in, CoarseVector &out){ void M (const CoarseVector &in, CoarseVector &out)
{
conformable(_grid,in.Grid()); conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid()); conformable(in.Grid(),out.Grid());
// RealD Nin = norm2(in);
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
double comms_usec = -usecond();
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
comms_usec += usecond(); autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
auto in_v = in.View();
auto out_v = out.View();
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex; typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
GridStopWatch ArithmeticTimer;
int osites=Grid()->oSites(); int osites=Grid()->oSites();
// double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
// double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
double usecs =-usecond();
// assert(geom.npoint==9);
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis; int ss = sss/nbasis;
@ -580,41 +316,28 @@ public:
int ptype; int ptype;
StencilEntry *SE; StencilEntry *SE;
int lane=SIMTlane(Nsimd);
for(int point=0;point<geom.npoint;point++){ for(int point=0;point<geom.npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss); SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) { if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else { } else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
} }
synchronise(); acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) { for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb); res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
} }
} }
coalescedWrite(out_v[ss](b),res,lane); coalescedWrite(out_v[ss](b),res);
}); });
usecs +=usecond();
double nrm_usec=-usecond(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
RealD Nout= norm2(out);
nrm_usec+=usecond();
/*
std::cout << GridLogMessage << "\tNorm " << nrm_usec << " us" <<std::endl;
std::cout << GridLogMessage << "\tHalo " << comms_usec << " us" <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << usecs << " us" <<std::endl;
std::cout << GridLogMessage << "\t mflop/s " << flops/usecs<<std::endl;
std::cout << GridLogMessage << "\t MB/s " << bytes/usecs<<std::endl;
*/
return Nout;
}; };
RealD Mdag (const CoarseVector &in, CoarseVector &out) void Mdag (const CoarseVector &in, CoarseVector &out)
{ {
if(hermitian) { if(hermitian) {
// corresponds to Petrov-Galerkin coarsening // corresponds to Petrov-Galerkin coarsening
@ -625,7 +348,6 @@ public:
G5C(tmp, in); G5C(tmp, in);
M(tmp, out); M(tmp, out);
G5C(out, out); G5C(out, out);
return norm2(out);
} }
}; };
void MdirComms(const CoarseVector &in) void MdirComms(const CoarseVector &in)
@ -640,11 +362,11 @@ public:
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View()); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
auto out_v = out.View(); autoView( out_v , out, AcceleratorWrite);
auto in_v = in.View(); autoView( in_v , in, AcceleratorRead);
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
@ -658,45 +380,21 @@ public:
int ptype; int ptype;
StencilEntry *SE; StencilEntry *SE;
int lane=SIMTlane(Nsimd);
SE=Stencil.GetEntry(ptype,point,ss); SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) { if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else { } else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
} }
synchronise(); acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) { for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb); res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
} }
coalescedWrite(out_v[ss](b),res,lane); coalescedWrite(out_v[ss](b),res);
}); });
#if 0 for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
accelerator_for(ss,Grid()->oSites(),1,{
siteVector res = Zero();
siteVector nbr;
int ptype;
StencilEntry *SE;
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in_v[SE->_offset];
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
synchronise();
res = res + Aview_p[point][ss]*nbr;
out_v[ss]=res;
});
#endif
} }
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{ {
@ -864,14 +562,12 @@ public:
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi); blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
auto iZProj_v = iZProj.View() ; autoView( iZProj_v , iZProj, AcceleratorRead) ;
auto oZProj_v = oZProj.View() ; autoView( oZProj_v , oZProj, AcceleratorRead) ;
auto A_p = A[p].View(); autoView( A_p , A[p], AcceleratorWrite);
auto A_self = A[self_stencil].View(); autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
// if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
// accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });
} }
} }
@ -885,11 +581,11 @@ public:
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
{ {
auto tmp_ = tmp.View(); autoView( tmp_ , tmp, AcceleratorWrite);
auto evenmask_ = evenmask.View(); autoView( evenmask_ , evenmask, AcceleratorRead);
auto oddmask_ = oddmask.View(); autoView( oddmask_ , oddmask, AcceleratorRead);
auto Mphie_ = Mphie.View(); autoView( Mphie_ , Mphie, AcceleratorRead);
auto Mphio_ = Mphio.View(); autoView( Mphio_ , Mphio, AcceleratorRead);
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
}); });
@ -897,8 +593,8 @@ public:
blockProject(SelfProj,tmp,Subspace.subspace); blockProject(SelfProj,tmp,Subspace.subspace);
auto SelfProj_ = SelfProj.View(); autoView( SelfProj_ , SelfProj, AcceleratorRead);
auto A_self = A[self_stencil].View(); autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
for(int j=0;j<nbasis;j++){ for(int j=0;j<nbasis;j++){
@ -912,33 +608,8 @@ public:
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl; std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
ForceHermitian(); ForceHermitian();
} }
// AssertHermitian();
// ForceDiagonal();
} }
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< A[p] << std::endl;
}
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
void ForceHermitian(void) { void ForceHermitian(void) {
CoarseMatrix Diff (Grid()); CoarseMatrix Diff (Grid());
for(int p=0;p<geom.npoint;p++){ for(int p=0;p<geom.npoint;p++){
@ -958,27 +629,6 @@ public:
} }
} }
} }
void AssertHermitian(void) {
CoarseMatrix AA (Grid());
CoarseMatrix AAc (Grid());
CoarseMatrix Diff (Grid());
for(int d=0;d<4;d++){
int dd=d+1;
AAc = Cshift(A[2*d+1],dd,1);
AA = A[2*d];
Diff = AA - adj(AAc);
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
}
Diff = A[8] - adj(A[8]);
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
}
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { }; template<class scalar> struct FFTW { };
@ -191,7 +189,7 @@ public:
typedef typename sobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View(); autoView(pgbuf_v , pgbuf, CpuWrite);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -232,15 +230,18 @@ public:
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) { for(int p=0;p<processors[dim];p++) {
thread_for(idx, sgrid->lSites(),{ {
autoView(r_v,result,CpuRead);
autoView(p_v,pgbuf,CpuWrite);
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd); Coordinate cbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf); sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf); peekLocalSite(s,r_v,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L; cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L; pokeLocalSite(s,p_v,cbuf);
pokeLocalSite(s,pgbuf,cbuf); });
}); }
if (p != processors[dim] - 1) { if (p != processors[dim] - 1) {
result = Cshift(result,dim,L); result = Cshift(result,dim,L);
} }
@ -269,15 +270,19 @@ public:
flops+= flops_call*NN; flops+= flops_call*NN;
// writing out result // writing out result
thread_for(idx,sgrid->lSites(),{ {
autoView(pgbuf_v,pgbuf,CpuRead);
autoView(result_v,result,CpuWrite);
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd); Coordinate clbuf(Nd), cgbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf); sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf; cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc; cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf); peekLocalSite(s,pgbuf_v,cgbuf);
pokeLocalSite(s,result,clbuf); pokeLocalSite(s,result_v,clbuf);
}); });
}
result = result*div; result = result*div;
// destroying plan // destroying plan

View File

@ -43,7 +43,6 @@ NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class LinearOperatorBase { template<class Field> class LinearOperatorBase {
public: public:
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
@ -94,7 +93,10 @@ public:
_Mat.Mdag(in,out); _Mat.Mdag(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2); _Mat.MdagM(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
_Mat.MdagM(in,out); _Mat.MdagM(in,out);
@ -131,17 +133,14 @@ public:
assert(0); assert(0);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2); HermOp(in,out);
out = out + _shift*in; ComplexD dot = innerProduct(in,out);
ComplexD dot;
dot= innerProduct(in,out);
n1=real(dot); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
RealD n1,n2; _Mat.MdagM(in,out);
HermOpAndNorm(in,out,n1,n2); out = out + _shift*in;
} }
}; };
@ -170,7 +169,7 @@ public:
_Mat.M(in,out); _Mat.M(in,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out); HermOp(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot); ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out); n2=norm2(out);
} }
@ -208,338 +207,305 @@ public:
} }
}; };
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several // Even Odd Schur decomp operators; there are several
// ways to introduce the even odd checkerboarding // ways to introduce the even odd checkerboarding
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
template<class Field> template<class Field>
class SchurOperatorBase : public LinearOperatorBase<Field> { class SchurOperatorBase : public LinearOperatorBase<Field> {
public: public:
virtual RealD Mpc (const Field &in, Field &out) =0; virtual void Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0; virtual void MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard(); tmp.Checkerboard() = in.Checkerboard();
ni=Mpc(in,tmp); Mpc(in,tmp);
no=MpcDag(tmp,out); MpcDag(tmp,out);
} }
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.Checkerboard() = in.Checkerboard();
MpcDagMpc(in,out,n1,n2);
}
virtual void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
void Op (const Field &in, Field &out){
Mpc(in,out);
}
void AdjOp (const Field &in, Field &out){
MpcDag(in,out);
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0); // must coarsen the unpreconditioned system
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
};
// Schur-complement operator that keeps the diagonal block explicit:
//   Mpc    : out = Mooee    in - Meooe    MooeeInv    Meooe    in
//   MpcDag : out = MooeeDag in - MeooeDag MooeeInvDag MeooeDag in
// Both return the RealD produced by axpy_norm for the final combination
// (presumably the squared norm of out -- confirm against axpy_norm).
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
public:
  Matrix &_Mat; // matrix providing the Meooe / Mooee / MooeeInv building blocks

  SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};

  virtual RealD Mpc (const Field &in, Field &out) {
    Field hop(in.Grid());
    // The hopping-term result is tagged with the opposite checkerboard to the input.
    hop.Checkerboard() = !in.Checkerboard();
    _Mat.Meooe(in,hop);
    _Mat.MooeeInv(hop,out);   // out doubles as scratch until the last two calls
    _Mat.Meooe(out,hop);
    _Mat.Mooee(in,out);
    const RealD nrm = axpy_norm(out,-1.0,hop,out); // out = out - hop
    return nrm;
  }

  virtual RealD MpcDag (const Field &in, Field &out){
    Field hop(in.Grid());
    _Mat.MeooeDag(in,hop);
    _Mat.MooeeInvDag(hop,out);
    _Mat.MeooeDag(out,hop);
    _Mat.MooeeDag(in,out);
    const RealD nrm = axpy_norm(out,-1.0,hop,out); // out = out - hop
    return nrm;
  }
};
// Left-preconditioned Schur operator with unit diagonal:
//   Mpc    : out = in - MooeeInv Meooe MooeeInv Meooe in
//   MpcDag : out = in - MeooeDag MooeeInvDag MeooeDag MooeeInvDag in
// Both return the RealD produced by axpy_norm
// (presumably the squared norm of out -- confirm against axpy_norm).
template<class Matrix,class Field>
class SchurDiagOneOperator : public SchurOperatorBase<Field> {
protected:
  Matrix &_Mat;
public:
  SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};

  virtual RealD Mpc (const Field &in, Field &out) {
    Field work(in.Grid());
    // out and work alternate as source/destination; the final product ends in work.
    _Mat.Meooe(in,out);
    _Mat.MooeeInv(out,work);
    _Mat.Meooe(work,out);
    _Mat.MooeeInv(out,work);
    const RealD nrm = axpy_norm(out,-1.0,work,in); // out = in - work
    return nrm;
  }

  virtual RealD MpcDag (const Field &in, Field &out){
    Field work(in.Grid());
    _Mat.MooeeInvDag(in,out);
    _Mat.MeooeDag(out,work);
    _Mat.MooeeInvDag(work,out);
    _Mat.MeooeDag(out,work);
    const RealD nrm = axpy_norm(out,-1.0,work,in); // out = in - work
    return nrm;
  }
};
// Right-preconditioned Schur operator with unit diagonal:
//   Mpc    : out = in - Meooe MooeeInv Meooe MooeeInv in
//   MpcDag : out = in - MooeeInvDag MeooeDag MooeeInvDag MeooeDag in
// Both return the RealD produced by axpy_norm
// (presumably the squared norm of out -- confirm against axpy_norm).
template<class Matrix,class Field>
class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected:
  Matrix &_Mat;
public:
  SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};

  virtual RealD Mpc (const Field &in, Field &out) {
    Field work(in.Grid());
    // out and work alternate as source/destination; the final product ends in work.
    _Mat.MooeeInv(in,out);
    _Mat.Meooe(out,work);
    _Mat.MooeeInv(work,out);
    _Mat.Meooe(out,work);
    const RealD nrm = axpy_norm(out,-1.0,work,in); // out = in - work
    return nrm;
  }

  virtual RealD MpcDag (const Field &in, Field &out){
    Field work(in.Grid());
    _Mat.MeooeDag(in,out);
    _Mat.MooeeInvDag(out,work);
    _Mat.MeooeDag(work,out);
    _Mat.MooeeInvDag(out,work);
    const RealD nrm = axpy_norm(out,-1.0,work,in); // out = in - work
    return nrm;
  }
};
// Common interface for Schur-preconditioned operators that are not assumed
// Hermitian.  Derived classes supply Mpc / MpcDag; the Hermitian entry points
// are deliberately unusable and trip assert(0).
template<class Field>
class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
{
public:
  virtual RealD Mpc   (const Field& in, Field& out) = 0;
  virtual RealD MpcDag(const Field& in, Field& out) = 0;

  // out = MpcDag( Mpc(in) ); ni and no receive the values returned by the
  // two stages, in that order.
  virtual void MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) {
    Field mid(in.Grid());
    mid.Checkerboard() = in.Checkerboard();
    ni = Mpc(in,mid);
    no = MpcDag(mid,out);
  }

  // Hermitian interface: not available for this family of operators.
  virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
    assert(0);
  }
  virtual void HermOp(const Field& in, Field& out) {
    assert(0);
  }

  // Forward / adjoint application map straight onto Mpc / MpcDag.
  void Op(const Field& in, Field& out) {
    Mpc(in, out);
  }
  void AdjOp(const Field& in, Field& out) {
    MpcDag(in, out);
  }

  // Support for coarsening to a multigrid
  void OpDiag(const Field& in, Field& out) {
    assert(0); // must coarsen the unpreconditioned system
  }
  void OpDir(const Field& in, Field& out, int dir, int disp) {
    assert(0);
  }
};
// Non-Hermitian Schur operator keeping the diagonal block explicit:
//   Mpc    : out = Mooee    in - Meooe    MooeeInv    Meooe    in
//   MpcDag : out = MooeeDag in - MeooeDag MooeeInvDag MeooeDag in
// Both return the RealD produced by axpy_norm
// (presumably the squared norm of out -- confirm against axpy_norm).
template<class Matrix, class Field>
class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase<Field>
{
public:
  Matrix& _Mat;

  NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};

  virtual RealD Mpc(const Field& in, Field& out) {
    Field hop(in.Grid());
    // The hopping-term result is tagged with the opposite checkerboard to the input.
    hop.Checkerboard() = !in.Checkerboard();
    _Mat.Meooe(in, hop);
    _Mat.MooeeInv(hop, out);   // out doubles as scratch until the last two calls
    _Mat.Meooe(out, hop);
    _Mat.Mooee(in, out);
    const RealD nrm = axpy_norm(out, -1.0, hop, out); // out = out - hop
    return nrm;
  }

  virtual RealD MpcDag(const Field& in, Field& out) {
    Field hop(in.Grid());
    _Mat.MeooeDag(in, hop);
    _Mat.MooeeInvDag(hop, out);
    _Mat.MeooeDag(out, hop);
    _Mat.MooeeDag(in, out);
    const RealD nrm = axpy_norm(out, -1.0, hop, out); // out = out - hop
    return nrm;
  }
};
// Non-Hermitian, left-preconditioned Schur operator with unit diagonal:
//   Mpc    : out = in - MooeeInv Meooe MooeeInv Meooe in
//   MpcDag : out = in - MeooeDag MooeeInvDag MeooeDag MooeeInvDag in
// Both return the RealD produced by axpy_norm
// (presumably the squared norm of out -- confirm against axpy_norm).
template<class Matrix,class Field>
class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field>
{
protected:
  Matrix &_Mat;
public:
  NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};

  virtual RealD Mpc(const Field& in, Field& out) {
    Field work(in.Grid());
    // out and work alternate as source/destination; the final product ends in work.
    _Mat.Meooe(in, out);
    _Mat.MooeeInv(out, work);
    _Mat.Meooe(work, out);
    _Mat.MooeeInv(out, work);
    const RealD nrm = axpy_norm(out, -1.0, work, in); // out = in - work
    return nrm;
  }

  virtual RealD MpcDag(const Field& in, Field& out) {
    Field work(in.Grid());
    _Mat.MooeeInvDag(in, out);
    _Mat.MeooeDag(out, work);
    _Mat.MooeeInvDag(work, out);
    _Mat.MeooeDag(out, work);
    const RealD nrm = axpy_norm(out, -1.0, work, in); // out = in - work
    return nrm;
  }
};
// Non-Hermitian, right-preconditioned Schur operator with unit diagonal:
//   Mpc    : out = in - Meooe MooeeInv Meooe MooeeInv in
//   MpcDag : out = in - MooeeInvDag MeooeDag MooeeInvDag MeooeDag in
// Both return the RealD produced by axpy_norm
// (presumably the squared norm of out -- confirm against axpy_norm).
template<class Matrix, class Field>
class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field>
{
protected:
  Matrix& _Mat;
public:
  NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};

  virtual RealD Mpc(const Field& in, Field& out) {
    Field work(in.Grid());
    // out and work alternate as source/destination; the final product ends in work.
    _Mat.MooeeInv(in, out);
    _Mat.Meooe(out, work);
    _Mat.MooeeInv(work, out);
    _Mat.Meooe(out, work);
    const RealD nrm = axpy_norm(out, -1.0, work, in); // out = in - work
    return nrm;
  }

  virtual RealD MpcDag(const Field& in, Field& out) {
    Field work(in.Grid());
    _Mat.MeooeDag(in, out);
    _Mat.MooeeInvDag(out, work);
    _Mat.MeooeDag(work, out);
    _Mat.MooeeInvDag(out, work);
    const RealD nrm = axpy_norm(out, -1.0, work, in); // out = in - work
    return nrm;
  }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Staggered use
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
Field tmp;
RealD mass;
double tMpc;
double tIP;
double tMeo;
double taxpby_norm;
uint64_t ncall;
public:
void Report(void)
{
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
}
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
assert( _Mat.isTrivialEE() );
mass = _Mat.Mass();
tMpc=0;
tIP =0;
tMeo=0;
taxpby_norm=0;
ncall=0;
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ncall++; out.Checkerboard() = in.Checkerboard();
tMpc-=usecond(); MpcDagMpc(in,out);
n2 = Mpc(in,out); ComplexD dot= innerProduct(in,out);
tMpc+=usecond(); n1=real(dot);
tIP-=usecond(); n2=norm2(out);
ComplexD dot= innerProduct(in,out);
tIP+=usecond();
n1 = real(dot);
} }
virtual void HermOp(const Field &in, Field &out){ virtual void HermOp(const Field &in, Field &out){
ncall++; out.Checkerboard() = in.Checkerboard();
tMpc-=usecond(); MpcDagMpc(in,out);
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMpc+=usecond();
taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
} }
virtual RealD Mpc (const Field &in, Field &out) void Op (const Field &in, Field &out){
{ Mpc(in,out);
}
void AdjOp (const Field &in, Field &out){
MpcDag(in,out);
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0); // must coarsen the unpreconditioned system
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
};
};
// Schur-complement operator that keeps the diagonal block explicit:
//   Mpc    : out = Mooee    in - Meooe    MooeeInv    Meooe    in
//   MpcDag : out = MooeeDag in - MeooeDag MooeeInvDag MeooeDag in
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
public:
  Matrix &_Mat; // matrix providing the Meooe / Mooee / MooeeInv building blocks

  SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};

  virtual void Mpc (const Field &in, Field &out) {
    Field hop(in.Grid());
    // The hopping-term result is tagged with the opposite checkerboard to the input.
    hop.Checkerboard() = !in.Checkerboard();
    _Mat.Meooe(in,hop);
    _Mat.MooeeInv(hop,out);   // out doubles as scratch until the last two calls
    _Mat.Meooe(out,hop);
    _Mat.Mooee(in,out);
    axpy(out,-1.0,hop,out);   // out = out - hop
  }

  virtual void MpcDag (const Field &in, Field &out){
    Field hop(in.Grid());
    _Mat.MeooeDag(in,hop);
    _Mat.MooeeInvDag(hop,out);
    _Mat.MeooeDag(out,hop);
    _Mat.MooeeDag(in,out);
    axpy(out,-1.0,hop,out);   // out = out - hop
  }
};
// Left-preconditioned Schur operator with unit diagonal:
//   Mpc    : out = in - MooeeInv Meooe MooeeInv Meooe in
//   MpcDag : out = in - MeooeDag MooeeInvDag MeooeDag MooeeInvDag in
template<class Matrix,class Field>
class SchurDiagOneOperator : public SchurOperatorBase<Field> {
protected:
  Matrix &_Mat;
public:
  SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};

  virtual void Mpc (const Field &in, Field &out) {
    Field work(in.Grid());
    // out and work alternate as source/destination; the final product ends in work.
    _Mat.Meooe(in,out);
    _Mat.MooeeInv(out,work);
    _Mat.Meooe(work,out);
    _Mat.MooeeInv(out,work);
    axpy(out,-1.0,work,in);   // out = in - work
  }

  virtual void MpcDag (const Field &in, Field &out){
    Field work(in.Grid());
    _Mat.MooeeInvDag(in,out);
    _Mat.MeooeDag(out,work);
    _Mat.MooeeInvDag(work,out);
    _Mat.MeooeDag(out,work);
    axpy(out,-1.0,work,in);   // out = in - work
  }
};
// Right-preconditioned Schur operator with unit diagonal:
//   Mpc    : out = in - Meooe MooeeInv Meooe MooeeInv in
//   MpcDag : out = in - MooeeInvDag MeooeDag MooeeInvDag MeooeDag in
template<class Matrix,class Field>
class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected:
  Matrix &_Mat;
public:
  SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};

  virtual void Mpc (const Field &in, Field &out) {
    Field work(in.Grid());
    // out and work alternate as source/destination; the final product ends in work.
    _Mat.MooeeInv(in,out);
    _Mat.Meooe(out,work);
    _Mat.MooeeInv(work,out);
    _Mat.Meooe(out,work);
    axpy(out,-1.0,work,in);   // out = in - work
  }

  virtual void MpcDag (const Field &in, Field &out){
    Field work(in.Grid());
    _Mat.MeooeDag(in,out);
    _Mat.MooeeInvDag(out,work);
    _Mat.MeooeDag(work,out);
    _Mat.MooeeInvDag(out,work);
    axpy(out,-1.0,work,in);   // out = in - work
  }
};
// Common interface for Schur-preconditioned operators that are not assumed
// Hermitian.  Derived classes supply Mpc / MpcDag; the Hermitian entry points
// and the multigrid coarsening hooks are deliberately unusable (assert(0)).
template<class Field>
class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
{
public:
  virtual void Mpc   (const Field& in, Field& out) = 0;
  virtual void MpcDag(const Field& in, Field& out) = 0;

  // out = MpcDag( Mpc(in) ), using a temporary for the intermediate vector.
  virtual void MpcDagMpc(const Field& in, Field& out) {
    Field mid(in.Grid());
    mid.Checkerboard() = in.Checkerboard();
    Mpc(in,mid);
    MpcDag(mid,out);
  }

  // Hermitian interface: not available for this family of operators.
  virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
    assert(0);
  }
  virtual void HermOp(const Field& in, Field& out) {
    assert(0);
  }

  // Forward / adjoint application map straight onto Mpc / MpcDag.
  void Op(const Field& in, Field& out) {
    Mpc(in, out);
  }
  void AdjOp(const Field& in, Field& out) {
    MpcDag(in, out);
  }

  // Support for coarsening to a multigrid
  void OpDiag(const Field& in, Field& out) {
    assert(0); // must coarsen the unpreconditioned system
  }
  void OpDir(const Field& in, Field& out, int dir, int disp) {
    assert(0);
  }
  void OpDirAll(const Field& in, std::vector<Field>& out){
    assert(0);
  }
};
// Non-Hermitian Schur operator keeping the diagonal block explicit:
//   Mpc    : out = Mooee    in - Meooe    MooeeInv    Meooe    in
//   MpcDag : out = MooeeDag in - MeooeDag MooeeInvDag MeooeDag in
template<class Matrix, class Field>
class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase<Field>
{
public:
  Matrix& _Mat;

  NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};

  virtual void Mpc(const Field& in, Field& out) {
    Field hop(in.Grid());
    // The hopping-term result is tagged with the opposite checkerboard to the input.
    hop.Checkerboard() = !in.Checkerboard();
    _Mat.Meooe(in, hop);
    _Mat.MooeeInv(hop, out);   // out doubles as scratch until the last two calls
    _Mat.Meooe(out, hop);
    _Mat.Mooee(in, out);
    axpy(out, -1.0, hop, out); // out = out - hop
  }

  virtual void MpcDag(const Field& in, Field& out) {
    Field hop(in.Grid());
    _Mat.MeooeDag(in, hop);
    _Mat.MooeeInvDag(hop, out);
    _Mat.MeooeDag(out, hop);
    _Mat.MooeeDag(in, out);
    axpy(out, -1.0, hop, out); // out = out - hop
  }
};
// Non-Hermitian, left-preconditioned Schur operator with unit diagonal:
//   Mpc    : out = in - MooeeInv Meooe MooeeInv Meooe in
//   MpcDag : out = in - MeooeDag MooeeInvDag MeooeDag MooeeInvDag in
template<class Matrix,class Field>
class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Field>
{
protected:
  Matrix &_Mat;
public:
  NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};

  virtual void Mpc(const Field& in, Field& out) {
    Field work(in.Grid());
    // out and work alternate as source/destination; the final product ends in work.
    _Mat.Meooe(in, out);
    _Mat.MooeeInv(out, work);
    _Mat.Meooe(work, out);
    _Mat.MooeeInv(out, work);
    axpy(out, -1.0, work, in); // out = in - work
  }

  virtual void MpcDag(const Field& in, Field& out) {
    Field work(in.Grid());
    _Mat.MooeeInvDag(in, out);
    _Mat.MeooeDag(out, work);
    _Mat.MooeeInvDag(work, out);
    _Mat.MeooeDag(out, work);
    axpy(out, -1.0, work, in); // out = in - work
  }
};
// Non-Hermitian, right-preconditioned Schur operator with unit diagonal:
//   Mpc    : out = in - Meooe MooeeInv Meooe MooeeInv in
//   MpcDag : out = in - MooeeInvDag MeooeDag MooeeInvDag MeooeDag in
template<class Matrix, class Field>
class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Field>
{
protected:
  Matrix& _Mat;
public:
  NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};

  virtual void Mpc(const Field& in, Field& out) {
    Field work(in.Grid());
    // out and work alternate as source/destination; the final product ends in work.
    _Mat.MooeeInv(in, out);
    _Mat.Meooe(out, work);
    _Mat.MooeeInv(work, out);
    _Mat.Meooe(out, work);
    axpy(out, -1.0, work, in); // out = in - work
  }

  virtual void MpcDag(const Field& in, Field& out) {
    Field work(in.Grid());
    _Mat.MeooeDag(in, out);
    _Mat.MooeeInvDag(out, work);
    _Mat.MeooeDag(work, out);
    _Mat.MooeeInvDag(out, work);
    axpy(out, -1.0, work, in); // out = in - work
  }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Staggered use
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
Field tmp;
RealD mass;
public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
assert( _Mat.isTrivialEE() );
mass = _Mat.Mass();
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
Mpc(in,out);
ComplexD dot= innerProduct(in,out);
n1 = real(dot);
n2 =0.0;
}
virtual void HermOp(const Field &in, Field &out){
Mpc(in,out);
// _Mat.Meooe(in,out);
// _Mat.Meooe(out,tmp);
// axpby(out,-1.0,mass*mass,tmp,in);
}
virtual void Mpc (const Field &in, Field &out)
{
Field tmp(in.Grid()); Field tmp(in.Grid());
Field tmp2(in.Grid()); Field tmp2(in.Grid());
// _Mat.Mooee(in,out);
// _Mat.Mooee(out,tmp);
// std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
_Mat.Mooee(in,out);
_Mat.Mooee(out,tmp);
// std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
tMeo-=usecond();
_Mat.Meooe(in,out); _Mat.Meooe(in,out);
_Mat.Meooe(out,tmp); _Mat.Meooe(out,tmp);
tMeo+=usecond(); axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm-=usecond();
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
} }
virtual RealD MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
return Mpc(in,out); Mpc(in,out);
} }
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) { virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered assert(0);// Never need with staggered
@ -547,7 +513,6 @@ public:
}; };
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>; template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Base classes for functions of operators // Base classes for functions of operators
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////

View File

@ -38,16 +38,12 @@ template<class Field> class SparseMatrixBase {
public: public:
virtual GridBase *Grid(void) =0; virtual GridBase *Grid(void) =0;
// Full checkerboar operations // Full checkerboar operations
virtual RealD M (const Field &in, Field &out)=0; virtual void M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0; virtual void Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp (in.Grid());
ni=M(in,tmp);
no=Mdag(tmp,out);
}
virtual void MdagM(const Field &in, Field &out) { virtual void MdagM(const Field &in, Field &out) {
RealD ni, no; Field tmp (in.Grid());
MdagM(in,out,ni,no); M(in,tmp);
Mdag(tmp,out);
} }
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;

View File

@ -234,10 +234,8 @@ public:
GridBase *grid=in.Grid(); GridBase *grid=in.Grid();
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites(); int vol=grid->gSites();
typedef typename Field::vector_type vector_type;
Field T0(grid); T0 = in; Field T0(grid); T0 = in;
Field T1(grid); Field T1(grid);
@ -258,14 +256,28 @@ public:
// out = ()*T0 + Coeffs[1]*T1; // out = ()*T0 + Coeffs[1]*T1;
axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1); axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
for(int n=2;n<order;n++){ for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y); Linop.HermOp(*Tn,y);
// y=xscale*y+mscale*(*Tn); #if 0
// *Tnp=2.0*y-(*Tnm); auto y_v = y.View();
// out=out+Coeffs[n]* (*Tnp); auto Tn_v = Tn->View();
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#else
axpby(y,xscale,mscale,y,(*Tn)); axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm)); axpby(*Tnp,2.0,-1.0,y,(*Tnm));
axpy(out,Coeffs[n],*Tnp,out); if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#endif
// Cycle pointers to avoid copies // Cycle pointers to avoid copies
Field *swizzle = Tnm; Field *swizzle = Tnm;
Tnm =Tn; Tnm =Tn;

View File

@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
LinearCombTimer.Start(); LinearCombTimer.Start();
bo = beta * omega; bo = beta * omega;
auto p_v = p.View(); {
auto r_v = r.View(); autoView( p_v , p, AcceleratorWrite);
auto v_v = v.View(); autoView( r_v , r, AcceleratorRead);
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ autoView( v_v , v, AcceleratorRead);
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
}); coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
alpha = rho / Calpha.real(); alpha = rho / Calpha.real();
LinearCombTimer.Start(); LinearCombTimer.Start();
auto h_v = h.View(); {
auto psi_v = psi.View(); autoView( p_v , p, AcceleratorRead);
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ autoView( r_v , r, AcceleratorRead);
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); autoView( v_v , v, AcceleratorRead);
}); autoView( psi_v,psi, AcceleratorRead);
autoView( h_v , h, AcceleratorWrite);
auto s_v = s.View(); autoView( s_v , s, AcceleratorWrite);
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
}); });
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
@ -166,13 +172,19 @@ class BiCGSTAB : public OperatorFunction<Field>
omega = Comega.real() / norm2(t); omega = Comega.real() / norm2(t);
LinearCombTimer.Start(); LinearCombTimer.Start();
auto t_v = t.View(); {
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ autoView( psi_v,psi, AcceleratorWrite);
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); autoView( r_v , r, AcceleratorWrite);
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); autoView( h_v , h, AcceleratorRead);
}); autoView( s_v , s, AcceleratorRead);
autoView( t_v , t, AcceleratorRead);
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
cp = norm2(r); cp = norm2(r);
LinalgTimer.Stop(); LinalgTimer.Stop();

View File

@ -140,13 +140,15 @@ public:
b = cp / c; b = cp / c;
LinearCombTimer.Start(); LinearCombTimer.Start();
auto psi_v = psi.View(); {
auto p_v = p.View(); autoView( psi_v , psi, AcceleratorWrite);
auto r_v = r.View(); autoView( p_v , p, AcceleratorWrite);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ autoView( r_v , r, AcceleratorWrite);
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
}); coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();

View File

@ -37,211 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
// Remove from w its component along each of basis[0..k-1], one vector at a
// time: w <- w - <basis[j],w> basis[j].
//
// Performance note carried over from the original author: if basis[j] are
// already orthonormal, all inner products could be taken in parallel (saving
// 2x bandwidth) and the update fused (3x on that line), for perhaps 2.5x
// speed up here and 2x overall in Multigrid Lanczos.
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
  for(int j=0; j<k; ++j){
    Field &bj = basis[j];
    auto overlap = innerProduct(bj,w);
    w = w - overlap*bj;
  }
}
// In-place rotation of a set of basis vectors by a dense real matrix.
// Active (#else) branch: for every site and every j in [j0,j1),
//   basis[j] <- sum_{k in [k0,k1)} Qt(j,k) * basis[k]
// The site loop is processed in chunks of siteBlock sites so that the scratch
// buffer Bt holds siteBlock*nrot site objects rather than a full copy of all
// rotated vectors.
//   basis  : vectors to rotate; entries j0..j1-1 are overwritten
//   Qt     : rotation coefficients, read as Qt(j,k) for j,k in [0,Nm)
//   j0,j1  : half-open range of output vectors
//   k0,k1  : half-open range of input vectors entering each sum
//   Nm     : extent of the Qt copy in each direction
// NOTE(review): nrot = j1-j0 is used as a divisor; j1==j0 would divide by zero.
// NOTE(review): ssites and sss are int while s/oSites/siteBlock are uint64_t;
//               presumably lattice sizes keep these in int range -- confirm.
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
// tmp_v only seeds the Vector so every slot holds a constructed View; each
// slot is overwritten with the matching basis[k].View() just below.
auto tmp_v = basis[0].View();
Vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
// Disabled alternative: per-thread accumulator array, rotating one site at a time.
#if 0
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
thread_region
{
vobj* B = Bt.data() + Nm * thread_num();
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
int nrot = j1-j0;
uint64_t oSites =grid->oSites();
// ceil(oSites/nrot) sites per chunk, so the scratch buffer is about one lattice vector in size.
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
// printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of Eigen matrix
// Stored so that Qt(j,k) sits at Qt_p[j*Nm+k].
Vector<double> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0];
for(int k=0;k<Nm;++k){
for(int j=0;j<Nm;++j){
Qt_p[j*Nm+k]=Qt(j,k);
}
}
// Block the loop to keep storage footprint down
vobj zz=Zero();
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
auto z=coalescedRead(zz);
coalescedWrite(Bp[ss],z);
});
// Accumulate: flat index sj encodes (block-local site ss, rotated vector j);
// sss is the global site and jj the global vector index.
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
}
});
// Write back only after every sum for this chunk is complete, so the inputs
// basis[k] are not overwritten while they are still being read.
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.View();
Vector<View> basis_v(basis.size(),result_v);
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
}
// Apply the permutation idx to _v and sort_vals in place, so that afterwards
// position i holds what was originally at idx[i].  idx is consumed: on return
// it is the identity on [0, idx.size()).
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
  int vlen = idx.size();
  assert(vlen>=1);
  assert(vlen<=sort_vals.size());
  assert(vlen<=_v.size());

  for (size_t dst=0;dst<vlen;dst++) {
    // Already in place: nothing to move.
    if (idx[dst] == dst) continue;

    //////////////////////////////////////
    // idx[dst] names the element that belongs at dst.  After swapping it in,
    // the element previously at dst sits at position idx[dst]; whichever
    // later slot wanted that element (idx[holder]==dst) must now be pointed
    // at its new location.
    //////////////////////////////////////
    size_t holder = dst;
    while (holder<idx.size() && idx[holder]!=dst) holder++;

    assert(idx[dst] > dst); assert(holder!=idx.size()); assert(idx[holder]==dst);

    swap(_v[dst],_v[idx[dst]]); // should use vector move constructor, no data copy
    std::swap(sort_vals[dst],sort_vals[idx[dst]]);

    idx[holder] = idx[dst];
    idx[dst]    = dst;
  }
}
// Return the index permutation that orders sort_vals by ascending absolute
// value; sort_vals itself is not modified.
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
  // Start from the identity permutation 0,1,...,n-1.
  std::vector<int> order(sort_vals.size());
  std::iota(order.begin(), order.end(), 0);

  // Compare indices through the magnitude of the value they refer to.
  auto smaller_magnitude = [&sort_vals](int lhs, int rhs) {
    return ::fabs(sort_vals[lhs]) < ::fabs(sort_vals[rhs]);
  };
  std::sort(order.begin(), order.end(), smaller_magnitude);
  return order;
}
// Sort _v and sort_vals together by |sort_vals|: ascending by default,
// descending when reverse is true.
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
  std::vector<int> order = basisSortGetIndex(sort_vals);
  if (reverse) {
    std::reverse(order.begin(), order.end());
  }
  basisReorderInPlace(_v,sort_vals,order);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
// Deflated guess from a set of vectors and associated values:
//   result = sum_i  <_v[i], src_orig> / eval[i]  *  _v[i]
// _v and eval must have the same length (asserted).  result is overwritten.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
  result = Zero();
  assert(_v.size()==eval.size());
  int N = (int)_v.size();
  for (int i=0;i<N;i++) {
    // _v is a const vector, so its elements can only be bound to a const
    // reference; a plain Field& here fails to compile once instantiated.
    const Field& tmp = _v[i];
    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
  }
}
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// Implicitly restarted lanczos // Implicitly restarted lanczos
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////

View File

@ -0,0 +1,241 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_NON_HERM_H
#define GRID_PREC_GCR_NON_HERM_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
// Variable-preconditioned GCR driver for operators that need not be Hermitian.
// operator() restarts GCRnStep up to MaxIterations times; each GCRnStep call
// performs at most nstep inner steps while orthogonalising new search
// directions against up to mmax-1 stored (p,q) pairs.
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
  RealD   Tolerance;      // relative target; the stopping test is cp < Tolerance^2 * |src|^2
  Integer MaxIterations;  // maximum number of GCRnStep restarts per solve
  int verbose;
  int mmax;               // length of the circular (p,q) history
  int nstep;              // inner steps per GCRnStep call
  int steps;              // inner steps taken so far in the current solve
  int level;              // tab depth / label used by GCRLogLevel

  GridStopWatch PrecTimer;
  GridStopWatch MatTimer;
  GridStopWatch LinalgTimer;

  LinearFunction<Field>     &Preconditioner;
  LinearOperatorBase<Field> &Linop;

  void Level(int lv) { level=lv; };

  // Initialisers are listed in member declaration order (the order in which
  // they actually run); steps starts at zero instead of being left unset.
  PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
    Tolerance(tol),
    MaxIterations(maxit),
    verbose(1),
    mmax(_mmax),
    nstep(_nstep),
    steps(0),
    level(1),
    Preconditioner(Prec),
    Linop(_Linop)
  {
  };

  // Solve Linop * psi = src starting from psi = 0.  Returns once the residual
  // reported by GCRnStep drops below the target, or after MaxIterations
  // restarts (in which case a message is logged and psi holds the last iterate).
  void operator() (const Field &src, Field &psi){

    psi=Zero();
    RealD cp, ssq,rsq;
    ssq=norm2(src);
    rsq=Tolerance*Tolerance*ssq;

    Field r(src.Grid());

    PrecTimer.Reset();
    MatTimer.Reset();
    LinalgTimer.Reset();

    GridStopWatch SolverTimer;
    SolverTimer.Start();

    steps=0;
    for(int k=0;k<MaxIterations;k++){

      cp=GCRnStep(src,psi,rsq);

      GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;

      if(cp<rsq) {

        SolverTimer.Stop();

        // Recompute the residual from scratch for the log line.
        Linop.Op(psi,r);
        axpy(r,-1.0,src,r);
        RealD tr = norm2(r);
        GCRLogLevel<<"PGCR: Converged on iteration " <<steps
                   << " computed residual "<<sqrt(cp/ssq)
                   << " true residual " <<sqrt(tr/ssq)
                   << " target " <<Tolerance <<std::endl;

        GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
        return;
      }

    }
    GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
    // assert(0);
  }

  // One restart cycle: up to nstep inner steps starting from the current psi.
  // Returns the last value of cp (the quantity compared against rsq).
  RealD GCRnStep(const Field &src, Field &psi,RealD rsq){

    // The circular history is indexed modulo mmax and the loop below must run
    // at least once to reach a return statement.
    assert(mmax  > 0);
    assert(nstep > 0);

    RealD cp;
    ComplexD a, b;
    ComplexD rq;

    GridBase *grid = src.Grid();

    Field r(grid);
    Field z(grid);
    Field Az(grid);

    ////////////////////////////////
    // history for flexible orthog
    ////////////////////////////////
    std::vector<Field> q(mmax,grid);
    std::vector<Field> p(mmax,grid);
    std::vector<RealD> qq(mmax);

    GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;

    //////////////////////////////////
    // psi may be nonzero on entry (restarts reuse the previous iterate),
    // so the starting residual is r0 = src - A psi.
    //////////////////////////////////
    MatTimer.Start();
    Linop.Op(psi,Az);
    MatTimer.Stop();

    LinalgTimer.Start();
    r=src-Az;
    LinalgTimer.Stop();
    GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;

    /////////////////////
    // p = Prec(r)
    /////////////////////
    PrecTimer.Start();
    Preconditioner(r,z);
    PrecTimer.Stop();

    MatTimer.Start();
    Linop.Op(z,Az);
    MatTimer.Stop();

    LinalgTimer.Start();
    //p[0],q[0],qq[0]
    p[0]= z;
    q[0]= Az;
    qq[0]= norm2(Az);

    cp =norm2(r);
    LinalgTimer.Stop();

    for(int k=0;k<nstep;k++){

      steps++;

      int kp     = k+1;
      int peri_k = k %mmax;
      int peri_kp= kp%mmax;

      LinalgTimer.Start();
      rq= innerProduct(q[peri_k],r); // what if rAr not real?
      a = rq/qq[peri_k];

      axpy(psi,a,p[peri_k],psi);

      cp = axpy_norm(r,-a,q[peri_k],r);
      LinalgTimer.Stop();

      GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;

      if((k==nstep-1)||(cp<rsq)){
        return cp;
      }

      PrecTimer.Start();
      Preconditioner(r,z);// solve Az = r
      PrecTimer.Stop();

      MatTimer.Start();
      Linop.Op(z,Az);
      MatTimer.Stop();

      LinalgTimer.Start();

      q[peri_kp]=Az;
      p[peri_kp]=z;

      int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
      for(int back=0;back<northog;back++){

        int peri_back=(k-back)%mmax; assert((k-back)>=0);

        // NOTE(review): only the real part of <q_back, Az> is used even
        // though b is complex -- confirm this is intended for a
        // non-Hermitian operator.
        b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
        p[peri_kp]=p[peri_kp]+b*p[peri_back];
        q[peri_kp]=q[peri_kp]+b*q[peri_back];

      }
      qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
      LinalgTimer.Stop();
    }
    assert(0); // never reached
    return cp;
  }
};
NAMESPACE_END(Grid);
#endif

View File

@ -6,72 +6,6 @@ NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr; MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false; bool MemoryProfiler::debug = false;
#ifdef GRID_NVCC
#define SMALL_LIMIT (0)
#else
#define SMALL_LIMIT (4096)
#endif
#ifdef POINTER_CACHE
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < SMALL_LIMIT ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<Ncache;e++) {
if ( Entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%Ncache;
}
if ( Entries[v].valid ) {
ret = Entries[v].address;
Entries[v].valid = 0;
Entries[v].address = NULL;
Entries[v].bytes = 0;
}
Entries[v].address=ptr;
Entries[v].bytes =bytes;
Entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes) {
if (bytes < SMALL_LIMIT ) return NULL;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<Ncache;e++){
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
Entries[e].valid = 0;
return Entries[e].address;
}
}
return NULL;
}
#endif
void check_huge_pages(void *Buf,uint64_t BYTES) void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
#ifdef __linux__ #ifdef __linux__

View File

@ -26,118 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H #pragma once
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#ifdef POINTER_CACHE
class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
#ifdef GRID_NVCC
static const int Ncache=128;
#else
static const int Ncache=8;
#endif
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
#endif
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp> template<typename _Tp>
class alignedAllocator { class alignedAllocator {
public: public:
@ -161,70 +53,60 @@ public:
{ {
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr; return ptr;
} }
void deallocate(pointer __p, size_type __n) { void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes); profilerFree(bytes);
MemoryManager::CpuFree((void *)__p,bytes);
}
#ifdef POINTER_CACHE // FIXME: hack for the copy constructor, eventually it must be avoided
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
#else void construct(pointer __p, const _Tp& __val) { assert(0);};
pointer __freeme = __p; void construct(pointer __p) { };
#endif void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
#ifdef GRID_NVCC template<typename _Tp>
if ( __freeme ) cudaFree((void *)__freeme); class uvmAllocator {
#else public:
#ifdef HAVE_MM_MALLOC_H typedef std::size_t size_type;
if ( __freeme ) _mm_free((void *)__freeme); typedef std::ptrdiff_t difference_type;
#else typedef _Tp* pointer;
if ( __freeme ) free((void *)__freeme); typedef const _Tp* const_pointer;
#endif typedef _Tp& reference;
#endif typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
uvmAllocator() throw() { }
uvmAllocator(const uvmAllocator&) throw() { }
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
~uvmAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::SharedFree((void *)__p,bytes);
} }
// FIXME: hack for the copy constructor, eventually it must be avoided // FIXME: hack for the copy constructor, eventually it must be avoided
@ -233,17 +115,17 @@ public:
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>; template<class T> using commAllocator = uvmAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using commVector = std::vector<T,alignedAllocator<T> >; template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; //template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/MemoryManager.h>
#include <Grid/allocator/AlignedAllocator.h>

View File

@ -0,0 +1,244 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*
 * Allocation types, selecting which pointer cache is used.
 * Each "Small" id is its large counterpart plus one: the cache index is
 * computed elsewhere in this file as (type + small), so the pairs must stay adjacent.
 */
#define Cpu         (0)
#define CpuSmall    (1)
#define Acc         (2)
#define AccSmall    (3)
#define Shared      (4)
#define SharedSmall (5)

// Running byte totals, adjusted by the allocate/free routines in this file.
uint64_t total_shared = 0;
uint64_t total_device = 0;
uint64_t total_host   = 0;
// Report the three running byte totals (shared, accelerator, cpu) on stdout.
void MemoryManager::PrintBytes(void)
{
  const char *tag = " MemoryManager : ";
  std::cout << tag << total_shared << " shared bytes "      << std::endl;
  std::cout << tag << total_device << " accelerator bytes " << std::endl;
  std::cout << tag << total_host   << " cpu bytes "         << std::endl;
}
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pointer caches
//////////////////////////////////////////////////////////////////////
// One row of cache slots per allocation type; only the first Ncache[type] slots of a row are searched.
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
// Per-type round-robin index of the slot to replace next when a row has no free slot.
int MemoryManager::Victim[MemoryManager::NallocType];
// Default cache depth per type, in the order Cpu, CpuSmall, Acc, AccSmall, Shared, SharedSmall.
// Init() may override these from the environment.
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
// Obtain a device buffer of `bytes` bytes: reuse a cached one when the
// free pool has it, otherwise allocate fresh and add it to total_device.
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
  void *cached = (void *) Lookup(bytes,Acc);
  if ( cached != (void *) NULL ) return cached;

  void *fresh = (void *) acceleratorAllocDevice(bytes);
  total_device+=bytes;
  return fresh;
}
// Hand a device buffer back to the free pool. If the pool returns a pointer
// (one it could not keep), that pointer is released to the device runtime.
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
  void *evicted = Insert(ptr,bytes,Acc);
  if ( evicted == NULL ) return;

  acceleratorFreeDevice(evicted);
  // NOTE(review): `bytes` is the size of `ptr`; the evicted buffer may have a
  // different size, which Insert does not report, so this total can drift.
  total_device-=bytes;
  // PrintBytes();
}
// Obtain a shared buffer of `bytes` bytes: reuse a cached one when the
// free pool has it, otherwise allocate fresh and add it to total_shared.
void *MemoryManager::SharedAllocate(size_t bytes)
{
  void *cached = (void *) Lookup(bytes,Shared);
  if ( cached != (void *) NULL ) return cached;

  void *fresh = (void *) acceleratorAllocShared(bytes);
  total_shared+=bytes;
  // std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<fresh<<std::dec<<std::endl;
  // PrintBytes();
  return fresh;
}
// Hand a shared buffer back to the free pool. If the pool returns a pointer
// (one it could not keep), that pointer is released via acceleratorFreeShared.
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
  void *evicted = Insert(ptr,bytes,Shared);
  if ( evicted == NULL ) return;

  acceleratorFreeShared(evicted);
  // NOTE(review): `bytes` is the size of `ptr`; the evicted buffer may have a
  // different size, which Insert does not report, so this total can drift.
  total_shared-=bytes;
  // PrintBytes();
}
// Obtain a host-side buffer of `bytes` bytes, reusing a cached one when possible.
// With GRID_UVM the backing store comes from acceleratorAllocShared,
// otherwise from acceleratorAllocCpu.
void *MemoryManager::CpuAllocate(size_t bytes)
{
  void *ptr = (void *) Lookup(bytes,Cpu);
  if ( ptr == (void *) NULL ) {
#ifdef GRID_UVM
    ptr = (void *) acceleratorAllocShared(bytes);
#else
    ptr = (void *) acceleratorAllocCpu(bytes);
#endif
    total_host+=bytes;
  }
  return ptr;
}
// Return a host-side buffer to the free pool. NotifyDeletion is called first,
// before the pointer can be recycled. Any pointer the pool hands back is
// released with the free routine matching the allocator used above.
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
  NotifyDeletion(_ptr);
  void *evicted = Insert(_ptr,bytes,Cpu);
  if ( evicted ) {
#ifdef GRID_UVM
    acceleratorFreeShared(evicted);
#else
    acceleratorFreeCpu(evicted);
#endif
    // NOTE(review): `bytes` is the size of `_ptr`, not necessarily of the evicted buffer.
    total_host-=bytes;
  }
}
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void MemoryManager::Init(void)
{
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
Ncache[Shared]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc;
}
}
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif
#ifdef GRID_UVM
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#endif
}
// Offer a freed pointer to the pool for allocation type `type`.
// Returns a pointer the caller must actually free, or NULL if nothing needs freeing.
// Without ALLOCATION_CACHE there is no pool, so `ptr` itself is returned.
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  // Small allocations use the pool immediately after their large counterpart.
  const int pool = (bytes < GRID_ALLOC_SMALL_LIMIT) ? (type + 1) : type;
  return Insert(ptr,bytes,Entries[pool],Ncache[pool],Victim[pool]);
#else
  return ptr;
#endif
}
// Store (ptr,bytes) in the given pool row.
//   entries : the pool row to use
//   ncache  : number of slots of that row in use; 0 means caching is disabled
//   victim  : round-robin replacement index for that row, advanced on eviction
// Returns the pointer the caller must now free: `ptr` itself when caching is
// disabled, the displaced entry when the row was full, otherwise NULL.
// Must not be called from inside an OpenMP parallel region.
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
#endif
  assert(ncache>=0);
  // A depth of zero is a legal configuration (Init accepts it); previously it
  // tripped assert(ncache>0). Treat it as "no pool": the caller frees at once.
  if ( ncache == 0 ) return ptr;

  // Prefer the first unused slot.
  int v = -1;
  for(int e=0;e<ncache;e++) {
    if ( entries[e].valid==0 ) {
      v=e;
      break;
    }
  }

  // Row full: replace round-robin. The modulo keeps the index inside the
  // active window even if the depth was lowered after victim had advanced.
  if ( v==-1 ) {
    v = victim % ncache;
    victim = (v+1)%ncache;
  }

  void *ret = NULL;
  if ( entries[v].valid ) {
    ret = entries[v].address; // displaced entry, to be freed by the caller
  }

  entries[v].address=ptr;
  entries[v].bytes =bytes;
  entries[v].valid =1;
  return ret;
}
// Ask the pool for allocation type `type` for a cached buffer of `bytes` bytes.
// Returns NULL when none is cached, or always when ALLOCATION_CACHE is not defined.
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  // Small allocations use the pool immediately after their large counterpart.
  const int pool = (bytes < GRID_ALLOC_SMALL_LIMIT) ? (type + 1) : type;
  return Lookup(bytes,Entries[pool],Ncache[pool]);
#else
  return NULL;
#endif
}
// Search the first `ncache` slots of a pool row for a valid entry of exactly
// `bytes` bytes. On a hit the slot is marked unused and its address returned;
// otherwise NULL. A depth of zero (caching disabled) is simply a miss rather
// than an assertion failure.
// Must not be called from inside an OpenMP parallel region.
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{
#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
#endif
  assert(ncache>=0);
  for(int e=0;e<ncache;e++){
    if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
      entries[e].valid = 0;
      return entries[e].address;
    }
  }
  return NULL;
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,181 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryManager.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <list>
#include <unordered_map>
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
// ALLOCATION_CACHE: when defined, freed pointers are kept in a small pool for reuse
// instead of being released immediately.
#define ALLOCATION_CACHE
// Allocation alignment in bytes (2MB). NOTE(review): not referenced in the code visible here;
// presumably consumed by the allocation backends - confirm.
#define GRID_ALLOC_ALIGN (2*1024*1024)
// Allocations strictly below this many bytes go to the "Small" pools.
#define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
// Usage hint passed along with a view request (the `hint` argument of ViewOpen).
enum ViewAdvise {
AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
// AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
// AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
// Which side opens the view (accelerator or cpu) and with what access.
// The values are distinct bits, except that CpuWriteDiscard currently aliases CpuWrite.
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
// Static-only manager for host, shared and accelerator allocations.
// It provides (a) a pool of recently freed pointers keyed by exact byte size,
// and (b) the view interface (ViewOpen/ViewClose) used to obtain a pointer
// valid on the cpu or the accelerator. When GRID_UVM is not defined it also
// declares the table and LRU list that track device copies of cpu buffers.
class MemoryManager {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
// One pool slot: a freed pointer, its size, and whether the slot is occupied.
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128; // slots reserved per allocation type
static const int NallocType=6; // number of allocation types (pools)
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType]; // per-pool round-robin replacement index
static int Ncache[NallocType]; // per-pool number of slots actually in use
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
// The (type) overloads select a pool; the (entries,...) overloads operate on one pool row.
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void PrintBytes(void);
public:
// Intended to be called once (see the definition): reads pool depths from the environment.
static void Init(void);
static void *SharedAllocate(size_t bytes);
static void SharedFree (void *ptr,size_t bytes);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
static uint64_t DeviceBytes; // bytes currently allocated for device copies
static uint64_t DeviceLRUBytes; // bytes of device copies currently in the LRU list
static uint64_t DeviceMaxBytes; // budget compared against when evicting
static uint64_t HostToDeviceBytes; // cumulative bytes copied host -> device
static uint64_t DeviceToHostBytes; // cumulative bytes copied device -> host
static uint64_t HostToDeviceXfer; // number of host -> device copies
static uint64_t DeviceToHostXfer; // number of device -> host copies
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
// LRU list of cpu pointers (stored as integers).
typedef std::list<uint64_t> LRU_t;
typedef typename LRU_t::iterator LRUiterator;
// Bookkeeping for one cpu buffer and its (optional) device copy.
typedef struct {
int LRU_valid; // nonzero while LRU_entry refers to a live element of the LRU list
LRUiterator LRU_entry;
uint64_t CpuPtr;
uint64_t AccPtr; // 0 when no device buffer is allocated
size_t bytes;
uint32_t transient; // nonzero: placed at the eviction end of the LRU list
uint32_t state; // consistency state of the two copies
uint32_t accLock; // count of open accelerator views
uint32_t cpuLock; // count of open cpu views
} AcceleratorViewEntry;
// Table of entries keyed by cpu pointer.
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
static AccViewTable_t AccViewTable;
static LRU_t LRU;
/////////////////////////////////////////////////
// Device motion
/////////////////////////////////////////////////
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
static void Evict(AcceleratorViewEntry &AccCache);
static void Flush(AcceleratorViewEntry &AccCache);
static void Clone(AcceleratorViewEntry &AccCache);
static void AccDiscard(AcceleratorViewEntry &AccCache);
static void CpuDiscard(AcceleratorViewEntry &AccCache);
// static void LRUupdate(AcceleratorViewEntry &AccCache);
static void LRUinsert(AcceleratorViewEntry &AccCache);
static void LRUremove(AcceleratorViewEntry &AccCache);
// manage entries in the table
static int EntryPresent(uint64_t CpuPtr);
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EntryErase (uint64_t CpuPtr);
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
static void AcceleratorViewClose(uint64_t AccPtr);
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
// Called by CpuFree before the pointer is returned to the pool.
static void NotifyDeletion(void * CpuPtr);
public:
static void Print(void);
static int isOpen (void* CpuPtr);
// Open/close a view of a cpu buffer in the given mode; ViewOpen returns the pointer to use.
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,468 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
// Table of device-copy bookkeeping entries, keyed by cpu pointer.
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
// List of cpu pointers whose entries are unlocked; victims are taken from the back.
MemoryManager::LRU_t MemoryManager::LRU;
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
// Default eviction budget: 128 MiB.
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
// Values stored in AcceleratorViewEntry::state (Empty..AccDirty) and,
// for EvictNext, in AcceleratorViewEntry::transient.
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
/////////////////////////////////////////////////
// Mechanics of data table maintenance
/////////////////////////////////////////////////
// Returns 1 if the table holds an entry for CpuPtr, otherwise 0.
int MemoryManager::EntryPresent(uint64_t CpuPtr)
{
  if ( AccViewTable.empty() ) {
    return 0;
  }
  auto hits = AccViewTable.count(CpuPtr);
  assert((hits==0)||(hits==1));
  return hits;
}
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
assert(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty;
AccCache.LRU_valid=0;
AccCache.transient=0;
AccCache.accLock=0;
AccCache.cpuLock=0;
AccViewTable[CpuPtr] = AccCache;
}
// Return the table iterator for CpuPtr. The entry is required to exist;
// a missing entry trips an assertion.
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
  assert(EntryPresent(CpuPtr));
  AccViewTableIterator found = AccViewTable.find(CpuPtr);
  assert(found!=AccViewTable.end());
  return found;
}
// Remove the table entry for CpuPtr; the entry is required to exist.
void MemoryManager::EntryErase(uint64_t CpuPtr)
{
  // The lookup is performed only for its existence assertions.
  (void) EntryLookup(CpuPtr);
  AccViewTable.erase(CpuPtr);
}
// Put an entry into the LRU list and account for its bytes.
// Transient entries go to the back (the end victims are taken from),
// all others to the front. The entry must not already be in the list.
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.LRU_valid==0);

  LRUiterator where = AccCache.transient ? LRU.end() : LRU.begin();
  AccCache.LRU_entry = LRU.insert(where,AccCache.CpuPtr); // iterator to the new element

  DeviceLRUBytes+=AccCache.bytes;
  AccCache.LRU_valid = 1;
}
// Take an entry out of the LRU list and deduct its bytes.
// The entry must currently be in the list.
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.LRU_valid==1);
  DeviceLRUBytes-=AccCache.bytes;
  LRU.erase(AccCache.LRU_entry);
  AccCache.LRU_valid = 0;
}
/////////////////////////////////////////////////
// Accelerator cache motion & consistency logic
/////////////////////////////////////////////////
///////////////////////////////////////////////////////////
// Drop the device copy and the table entry WITHOUT copying
// anything back to the cpu. The entry must be unlocked; if a
// device buffer exists the entry must be in the LRU list.
///////////////////////////////////////////////////////////
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  // Take the key now: AccCache refers into the table and dies with the erase below.
  const uint64_t key = AccCache.CpuPtr;

  if(AccCache.AccPtr) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes -=AccCache.bytes;
    LRUremove(AccCache);
  }
  EntryErase(key);
}
///////////////////////////////////////////////////////////////////////////
// Make the cpu copy current (flushing if the device copy is the golden
// one), then release the device buffer and erase the entry.
// The entry must be unlocked; if a device buffer exists it must be in the LRU list.
///////////////////////////////////////////////////////////////////////////
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);

  if(AccCache.state==AccDirty) {
    Flush(AccCache);
  }

  // From here on the work is exactly "discard without flush":
  // free the device buffer, leave the LRU list, erase the table entry.
  AccDiscard(AccCache);
}
// Copy the device buffer back to the cpu buffer and mark the entry Consistent.
// Requires state AccDirty, no locks held, and both pointers non-null.
// Updates the device-to-host byte and transfer counters.
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state==AccDirty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  void *device = (void *)AccCache.AccPtr;
  void *host   = (void *)AccCache.CpuPtr;
  acceleratorCopyFromDevice(device,host,AccCache.bytes);

  DeviceToHostXfer++;
  DeviceToHostBytes+=AccCache.bytes;
  AccCache.state=Consistent;
}
// Copy the cpu buffer to the device, allocating the device buffer first if
// there is none, and mark the entry Consistent.
// Requires state CpuDirty, no locks held, and a non-null cpu pointer.
// Updates the host-to-device byte and transfer counters.
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state==CpuDirty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool needDeviceBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if ( needDeviceBuffer ) {
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }

  void *host   = (void *)AccCache.CpuPtr;
  void *device = (void *)AccCache.AccPtr;
  acceleratorCopyToDevice(host,device,AccCache.bytes);

  HostToDeviceXfer++;
  HostToDeviceBytes+=AccCache.bytes;
  AccCache.state=Consistent;
}
// Declare the device copy golden without transferring any data: make sure a
// device buffer exists (allocating one if needed) and set the state to AccDirty.
// Requires a non-Empty, unlocked entry with a non-null cpu pointer.
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool needDeviceBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if ( needDeviceBuffer ) {
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
  AccCache.state=AccDirty;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
// Close a view previously opened with ViewOpen, dispatching on the side
// (accelerator or cpu) the mode belongs to. Any other mode value asserts.
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
  switch(mode) {
  case AcceleratorRead:
  case AcceleratorWrite:
  case AcceleratorWriteDiscard:
    AcceleratorViewClose((uint64_t)Ptr);
    break;
  case CpuRead:
  case CpuWrite: // CpuWriteDiscard has the same value
    CpuViewClose((uint64_t)Ptr);
    break;
  default:
    assert(0);
  }
}
// Open a view of the cpu buffer _CpuPtr in the requested mode and return the
// pointer produced by the accelerator- or cpu-side open routine.
// Any other mode value asserts (and returns NULL if assertions are disabled).
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
  const uint64_t CpuPtr = (uint64_t)_CpuPtr;
  switch(mode) {
  case AcceleratorRead:
  case AcceleratorWrite:
  case AcceleratorWriteDiscard:
    return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
  case CpuRead:
  case CpuWrite: // CpuWriteDiscard has the same value
    return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
  default:
    assert(0);
    return NULL;
  }
}
// Evict unlocked entries, taking victims from the back of the LRU list, until
// `bytes` more would fit alongside the remaining LRU bytes within DeviceMaxBytes.
// If the LRU pool is exhausted first there is nothing further to evict, so the
// routine returns; the original loop spun forever in that case (for example
// when a single request exceeds DeviceMaxBytes).
void MemoryManager::EvictVictims(uint64_t bytes)
{
  while(bytes+DeviceLRUBytes > DeviceMaxBytes){
    if ( DeviceLRUBytes == 0 ) {
      return; // no evictable data left: cannot make more room
    }
    assert(LRU.size()>0);
    uint64_t victim = LRU.back();
    auto AccCacheIterator = EntryLookup(victim);
    auto & AccCache = AccCacheIterator->second;
    Evict(AccCache); // erases the entry and removes it from the LRU list
  }
}
// Open an accelerator view of the cpu buffer CpuPtr and return the device
// pointer (as an integer) to use.
//   CpuPtr : key of the buffer in the view table
//   bytes  : size of the buffer; must match an existing entry
//   mode   : AcceleratorRead, AcceleratorWrite or AcceleratorWriteDiscard
//   hint   : any nonzero value marks the entry transient (EvictNext)
// Side effects: may evict other entries and create a new one, may allocate a
// device buffer and copy cpu data to it, increments accLock, and removes the
// entry from the LRU list while it is locked on the accelerator.
// It is an assertion failure to call this while a cpu view is open.
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
// Make room first, then register the buffer (EntryCreate starts it in state CpuDirty).
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,hint);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
// NOTE(review): EntryCreate sets state to CpuDirty, so the Empty branch below
// looks unreachable from the creation path in this function - confirm.
if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Cpu starts primary
if(mode==AcceleratorWriteDiscard){
// Contents will be overwritten: allocate only, no host-to-device copy.
CpuDiscard(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite){
Clone(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
// Contents will be overwritten: allocate only, no host-to-device copy.
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
// Device copy already valid: no data motion, only the state may change.
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
// Device copy is already the golden one: stays AccDirty for every mode.
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
assert(0);
}
// If view is opened on device remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
LRUremove(AccCache);
}
// Record the hint; it decides which end of the LRU list the entry joins when the view closes.
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
return AccCache.AccPtr;
}
////////////////////////////////////////////////////////////////////////////
// Close one accelerator view on the entry keyed by CpuPtr.
// Drops a single accelerator lock; once the count reaches zero the entry
// is handed to LRUinsert(). Asserts that no CPU view is open and that at
// least one accelerator view is outstanding.
////////////////////////////////////////////////////////////////////////////
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
{
  auto slot   = EntryLookup(CpuPtr);
  auto &entry = slot->second;

  assert(entry.cpuLock==0);
  assert(entry.accLock>0);

  --entry.accLock;

  // Still locked by another accelerator view: nothing more to do
  if(entry.accLock!=0) return;

  // Last accelerator view closed: entry goes onto the LRU queue
  LRUinsert(entry);
}
////////////////////////////////////////////////////////////////////////////
// Close one CPU view on the entry keyed by CpuPtr by dropping a CPU lock.
// Asserts that a CPU view is outstanding and no accelerator view is open.
////////////////////////////////////////////////////////////////////////////
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
{
  auto slot   = EntryLookup(CpuPtr);
  auto &entry = slot->second;

  assert(entry.cpuLock>0);
  assert(entry.accLock==0);

  --entry.cpuLock;
}
/*
* Open a CPU view on the allocation tracked under CpuPtr.
*
*  CpuPtr    : host address used as the key into the view table
*  bytes     : size of the allocation; must match the cached entry if one exists
*  mode      : CpuRead or CpuWrite
*  transient : non-zero marks the entry transient (EvictNext)
*
* Returns the CpuPtr stored in the cache entry.
* Asserts that no accelerator view is currently open on the same entry.
*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,transient);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
// A populated entry must describe exactly this allocation
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes);
}
if(AccCache.state==Empty) {
// Fresh entry: no accelerator pointer is recorded yet and the host copy is primary
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache.accLock= 0;
AccCache.cpuLock= 1;
} else if(AccCache.state==CpuDirty ){
// AccPtr dont care, deferred allocate
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++;
} else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL);
// Flush() is called before the state is updated; presumably it copies
// the accelerator data back to the host -- confirm against Flush()
Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++;
} else {
assert(0); // should be unreachable
}
// Any non-zero advice flags the entry as EvictNext; zero clears the flag
AccCache.transient= transient? EvictNext : 0;
return AccCache.CpuPtr;
}
////////////////////////////////////////////////////////////////////////////
// Called when a host allocation is going away: if the address has an entry
// in the view table, pass that entry to AccDiscard(). Unknown addresses
// are ignored.
////////////////////////////////////////////////////////////////////////////
void MemoryManager::NotifyDeletion(void *_ptr)
{
  uint64_t key = (uint64_t)_ptr;

  // Nothing cached for this address
  if(!EntryPresent(key)) return;

  auto slot = EntryLookup(key);
  AccDiscard(slot->second);
}
////////////////////////////////////////////////////////////////////////////
// Dump the memory manager counters and one row per view-table entry to
// std::cout under the GridLogDebug logger.
////////////////////////////////////////////////////////////////////////////
void MemoryManager::Print(void)
{
  const char *rule = "--------------------------------------------";

  // Summary counters
  std::cout << GridLogDebug << rule << std::endl;
  std::cout << GridLogDebug << "Memory Manager " << std::endl;
  std::cout << GridLogDebug << rule << std::endl;
  std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
  std::cout << GridLogDebug << DeviceLRUBytes << " bytes evictable on device " << std::endl;
  std::cout << GridLogDebug << DeviceMaxBytes << " bytes max on device " << std::endl;
  std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
  std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
  std::cout << GridLogDebug << HostToDeviceBytes << " bytes transfered to device " << std::endl;
  std::cout << GridLogDebug << DeviceToHostBytes << " bytes transfered from device " << std::endl;
  std::cout << GridLogDebug << AccViewTable.size() << " vectors " << LRU.size() << " evictable" << std::endl;

  // Per-entry table
  std::cout << GridLogDebug << rule << std::endl;
  std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid " << std::endl;
  std::cout << GridLogDebug << rule << std::endl;
  for(auto &row : AccViewTable){
    auto &entry = row.second;

    // Label stays empty for a state outside the four known values
    std::string label;
    if      ( entry.state==Consistent ) label = std::string("Consistent");
    else if ( entry.state==AccDirty   ) label = std::string("AccDirty");
    else if ( entry.state==CpuDirty   ) label = std::string("CpuDirty");
    else if ( entry.state==Empty      ) label = std::string("Empty");

    std::cout << GridLogDebug
              << "0x"   << std::hex << entry.CpuPtr << std::dec
              << "\t0x" << std::hex << entry.AccPtr << std::dec
              << "\t"   << label
              << "\t"   << entry.cpuLock
              << "\t"   << entry.accLock
              << "\t"   << entry.LRU_valid << std::endl;
  }
  std::cout << GridLogDebug << rule << std::endl;
}
////////////////////////////////////////////////////////////////////////////
// Number of views currently open on a host address: the sum of the CPU and
// accelerator lock counts of its entry, or 0 when the address has no entry.
////////////////////////////////////////////////////////////////////////////
int MemoryManager::isOpen (void* _CpuPtr)
{
  uint64_t key = (uint64_t)_CpuPtr;

  if ( !EntryPresent(key) ) return 0;

  auto slot   = EntryLookup(key);
  auto &entry = slot->second;
  return entry.cpuLock+entry.accLock;
}
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,24 @@
#include <Grid/GridCore.h>
#ifdef GRID_UVM
#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
// Storage for the static accounting counters. None of the stubs below
// updates them in this build.
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;

// Closing a view does nothing
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode)
{
}

// Opening a view hands back the host pointer unchanged
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
  return CpuPtr;
}

// No lock counts are kept in this build, so nothing is ever reported open
int MemoryManager::isOpen (void* CpuPtr)
{
  return 0;
}

// Nothing to report
void MemoryManager::Print(void)
{
}

// Nothing cached, so nothing to discard
void MemoryManager::NotifyDeletion(void *ptr)
{
}
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <vector>
NAMESPACE_BEGIN(Grid);
// Storage for the MemoryProfiler static members.
// stats starts as nullptr and debug as false.
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
////////////////////////////////////////////////////////////////////////////
// Diagnostic: report how much of a buffer is backed by huge pages.
//
//  Buf   : start of the buffer to inspect
//  BYTES : length of the buffer in bytes
//
// Linux only (compiles to an empty function elsewhere). Reads one 64-bit
// record per 4k page of the buffer from /proc/self/pagemap, then walks the
// buffer in groups of 512 pages (2MB). A page is counted as "not huge" when
// its frame address is not the group's first frame address plus the page
// offset. Pages beyond the last complete group of 512 are not examined.
// The totals are printed with the world rank.
//
// NOTE(review): the 55-bit mask is taken to select the page frame number
// field of the pagemap record -- confirm against the kernel pagemap docs.
////////////////////////////////////////////////////////////////////////////
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);

  const int page_size = 4096;
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;

  // Heap storage: one record per page would overflow the stack for large buffers
  std::vector<uint64_t> pagedata(npages);

  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata.data(), sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);

  // The descriptor is only needed for the read above; release it
  ::close(fd);

  int nhugepages = npages / 512;
  int n4ktotal = 0;
  int nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      if (pageaddr != baseaddr + j * page_size)
        ++nnothuge;
    }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
////////////////////////////////////////////////////////////////////////////
// Render a byte count as a short human readable string using 1024-based
// units, e.g. 0 -> "0 B", 1024 -> "1 KB", 1536 -> "1.5 KB".
//
//  bytes : size to format
//
// Returns the value scaled to the largest unit that keeps it below 1024
// (capped at the last suffix), printed as an integer when the scaled value
// is whole and with one decimal place otherwise.
////////////////////////////////////////////////////////////////////////////
std::string sizeString(const size_t bytes)
{
  static constexpr const char *suffixes[] = {"", "K", "M", "G", "T", "P", "E"};
  // Index of the last valid suffix; the scaling loop must never step past it
  constexpr size_t lastSuffix = sizeof(suffixes)/sizeof(suffixes[0]) - 1;

  char buf[256];
  size_t s = 0;
  double count = bytes;

  while (count >= 1024 && s < lastSuffix)
  {
    s++;
    count /= 1024;
  }
  if (count - floor(count) == 0.0)
  {
    snprintf(buf, sizeof(buf), "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
    snprintf(buf, sizeof(buf), "%.1f %sB", count, suffixes[s]);
  }
  return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,95 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
// Running byte counters updated by the profilerAllocate / profilerFree
// macros. All counters start at zero.
struct MemoryStats
{
  size_t totalAllocated{0};     // sum of every allocation recorded
  size_t maxAllocated{0};       // largest value currentlyAllocated has reached
  size_t currentlyAllocated{0}; // allocations recorded minus frees recorded
  size_t totalFreed{0};         // sum of every free recorded
};
// Global switches read by the profiler macros in this header.
class MemoryProfiler
{
public:
// Counters to update; the macros skip all accounting while this is null
static MemoryStats *stats;
// When true, profilerAllocate / profilerFree also log each event
static bool debug;
};
// memString(bytes): "<exact byte count> (<human readable size>)".
// Parenthesised so the expansion stays one operand inside larger expressions.
#define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")

// profilerDebugPrint: log the four MemoryStats counters under GridLogDebug.
// Does nothing while MemoryProfiler::stats is null.
// Wrapped in do/while(0) so the macro is a single statement; use it with a
// trailing semicolon.
#define profilerDebugPrint \
  do \
  { \
    if (MemoryProfiler::stats) \
    { \
      auto s = MemoryProfiler::stats; \
      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
      std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
                << std::endl; \
      std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
                << std::endl; \
      std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
                << std::endl; \
      std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
                << std::endl; \
    } \
  } while (0)

// profilerAllocate(bytes): record an allocation of `bytes` in the stats (when
// enabled) and, in debug mode, log the event followed by the counters.
// Wrapped in do/while(0) so both steps stay under a caller's if/else; use it
// with a trailing semicolon.
#define profilerAllocate(bytes) \
  do \
  { \
    if (MemoryProfiler::stats) \
    { \
      auto s = MemoryProfiler::stats; \
      s->totalAllocated += (bytes); \
      s->currentlyAllocated += (bytes); \
      s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
    } \
    if (MemoryProfiler::debug) \
    { \
      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
      profilerDebugPrint; \
    } \
  } while (0)

// profilerFree(bytes): record a free of `bytes` in the stats (when enabled)
// and, in debug mode, log the event followed by the counters.
// Wrapped in do/while(0) for the same reason as profilerAllocate.
#define profilerFree(bytes) \
  do \
  { \
    if (MemoryProfiler::stats) \
    { \
      auto s = MemoryProfiler::stats; \
      s->totalFreed += (bytes); \
      s->currentlyAllocated -= (bytes); \
    } \
    if (MemoryProfiler::debug) \
    { \
      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
      profilerDebugPrint; \
    } \
  } while (0)
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);

View File

@ -81,6 +81,7 @@ public:
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic; int LocallyPeriodic;
Coordinate _checker_dim_mask;
public: public:

View File

@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
@ -104,6 +105,7 @@ public:
_ldimensions.resize(_ndimension); _ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
@ -114,6 +116,8 @@ public:
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
_checker_dim_mask[d]=0;
_fdimensions[d] = dimensions[d]; // Global dimensions _fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d]; _simd_layout[d] = simd_layout[d];

View File

@ -35,12 +35,28 @@ static const int CbRed =0;
static const int CbBlack=1; static const int CbBlack=1;
static const int Even =CbRed; static const int Even =CbRed;
static const int Odd =CbBlack; static const int Odd =CbBlack;
// Checkerboard parity (0 or 1) of the site with outer index `oindex`.
//  oindex      : lexicographic index decoded with extents `rdim`
//  rdim        : extents used to decode the index into a coordinate
//  chk_dim_msk : per-dimension flag; only flagged dimensions contribute
// The parity is the low bit of the sum of the flagged coordinates.
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
{
  const int ndim = rdim.size();

  Coordinate site(ndim);
  Lexicographic::CoorFromIndex(site,oindex,rdim);

  int parity = 0;
  for(int mu=0; mu<ndim; mu++){
    if(!chk_dim_msk[mu]) continue;
    parity += site[mu];
  }
  return (parity&0x1);
}
// Specialise this for red black grids storing half the data like a chess board. // Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase class GridRedBlackCartesian : public GridBase
{ {
public: public:
Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;

View File

@ -114,6 +114,7 @@ public:
void GlobalSumVector(RealD *,int N); void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &); void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &); void GlobalSum(uint64_t &);
void GlobalSumVector(uint64_t*,int N);
void GlobalSum(ComplexF &c); void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N); void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c); void GlobalSum(ComplexD &c);

View File

@ -275,6 +275,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
// In-place element-wise sum of the N values at `u` across `communicator`.
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N)
{
  const int status = MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
  assert(status==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){ void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0); assert(ierr==0);

View File

@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){} void CartesianCommunicator::GlobalSum(double &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::GlobalSum(uint32_t &){} void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSum(uint64_t &){} void CartesianCommunicator::GlobalSum(uint64_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){} void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){}

View File

@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
if (heap_bytes >= heap_size) { if (heap_bytes >= heap_size) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl; std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl; std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
assert(heap_bytes<heap_size); assert(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl; //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
#ifdef GRID_NVCC #ifdef GRID_CUDA
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#endif #endif
@ -170,17 +170,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
std::vector<int> primes({2,3,5}); std::vector<int> primes({2,3,5});
int dim = 0; int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1; int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) { while(AutoShmSize != WorldShmSize) {
for(int p=0;p<primes.size();p++) { int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p]; int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim]) if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) { && divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime; AutoShmSize*=prime;
ShmDims[dim]*=prime; ShmDims[dim]*=prime;
last_dim = dim;
break; break;
} }
} }
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension; dim=(dim+1) %ndimension;
} }
} }
@ -413,7 +420,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC #ifdef GRID_CUDA
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@ -433,13 +440,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2); // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
#else
std::cout << "setting device to WorldShmRank"<<std::endl;
cudaSetDevice(WorldShmRank);
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -677,7 +677,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{ {
#ifdef GRID_NVCC #ifdef GRID_CUDA
cudaMemset(dest,0,bytes); cudaMemset(dest,0,bytes);
#else #else
bzero(dest,bytes); bzero(dest,bytes);
@ -685,7 +685,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes) void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{ {
#ifdef GRID_NVCC #ifdef GRID_CUDA
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault); cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else #else
bcopy(src,dest,bytes); bcopy(src,dest,bytes);

View File

@ -49,4 +49,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif #endif
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Cshift overloads for lattice expression templates.
// Each one passes the expression through closure() and forwards the
// result, with dim and shift unchanged, to another Cshift call.
// The trailing return type is Lattice<T>, where T is the type of the
// expression's op.func applied to its eval'd arguments.
///////////////////////////////////////////////////////////////////
// Unary expression
template<typename Op, typename T1>
auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
return Cshift(closure(expr),dim,shift);
}
// Binary expression
template <class Op, class T1, class T2>
auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
{
return Cshift(closure(expr),dim,shift);
}
// Trinary expression
template <class Op, class T1, class T2, class T3>
auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, expr.arg2),
eval(0, expr.arg3)))>
{
return Cshift(closure(expr),dim,shift);
}
NAMESPACE_END(Grid);
#endif #endif

View File

@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern Vector<std::pair<int,int> > Cshift_table;
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0; int ent = 0;
static Vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int bo = n*e2; int bo = n*e2;
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b); Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
} }
} }
} else { } else {
@ -65,14 +67,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int o = n*stride; int o = n*stride;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b); Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
} }
} }
} }
} }
thread_for(i,ent,{ {
buffer[table[i].first]=rhs_v[table[i].second]; autoView(rhs_v , rhs, AcceleratorRead);
}); auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
}
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@ -95,36 +102,38 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){ if ( cbmask ==0x3){
thread_for_collapse(2,n,e1,{ autoView(rhs_v , rhs, AcceleratorRead);
for(int b=0;b<e2;b++){ accelerator_for2d(n,e1,b,e2,1,{
int o = n*n1; int o = n*n1;
int offset = b+n*e2; int offset = b+n*e2;
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} });
});
} else { } else {
autoView(rhs_v , rhs, AcceleratorRead);
// Case of SIMD split AND checker dim cannot currently be hit, except in Coordinate rdim=rhs.Grid()->_rdimensions;
// Test_cshift_red_black code. Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
thread_for_collapse(2,n,e1,{ accelerator_for2d(n,e1,b,e2,1,{
for(int b=0;b<e2;b++){
Coordinate coor;
int o=n*n1; int o=n*n1;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2; int offset = b+n*e2;
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
} });
});
} }
} }
@ -145,7 +154,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent =0; int ent =0;
if ( cbmask ==0x3 ) { if ( cbmask ==0x3 ) {
@ -154,7 +164,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs.Grid()->_slice_block[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b); Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
} }
} }
@ -165,16 +175,20 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++); Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
} }
} }
} }
} }
auto rhs_v = rhs.View(); {
thread_for(i,ent,{ autoView( rhs_v, rhs, AcceleratorWrite);
rhs_v[table[i].first]=buffer[table[i].second]; auto buffer_p = & buffer[0];
}); auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
}
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
@ -194,21 +208,19 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorWrite);
thread_for_collapse(2,n,e1,{ accelerator_for2d(n,e1,b,e2,1,{
for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension];
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
} });
});
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
@ -225,6 +237,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// local to node block strided copies // local to node block strided copies
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask) template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@ -239,14 +252,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0; int ent=0;
if(cbmask == 0x3 ){ if(cbmask == 0x3 ){
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride+b; int o =n*stride+b;
table[ent++] = std::pair<int,int>(lo+o,ro+o); Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
} }
} else { } else {
@ -255,23 +270,24 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int o =n*stride+b; int o =n*stride+b;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o); Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
} }
} }
} }
auto rhs_v = rhs.View(); {
auto lhs_v = lhs.View(); autoView(rhs_v , rhs, AcceleratorRead);
thread_for(i,ent,{ autoView(lhs_v , lhs, AcceleratorWrite);
lhs_v[table[i].first]=rhs_v[table[i].second]; auto table = &Cshift_table[0];
}); accelerator_for(i,ent,1,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
}
} }
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@ -285,29 +301,33 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e2=rhs.Grid()->_slice_block [dimension]; int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0; int ent=0;
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} else { } else {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} }
auto rhs_v = rhs.View(); {
auto lhs_v = lhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
thread_for(i,ent,{ autoView( lhs_v, lhs, AcceleratorWrite);
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); auto table = &Cshift_table[0];
}); accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
}
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////

View File

@ -0,0 +1,4 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
// Single shared scratch table used by the Cshift plane-copy routines
// (Copy_plane / Copy_plane_permute). Each entry is a pair of site offsets:
// .first indexes the destination (lhs) view, .second the source (rhs) view.
// The users only ever grow it ("let it grow to biggest") and pass &Cshift_table[0]
// into an accelerator_for loop.
// NOTE(review): as a lone global this is shared by every caller; presumably the
// plane copies are never issued concurrently from several host threads - confirm.
Vector<std::pair<int,int> > Cshift_table;
NAMESPACE_END(Grid);

View File

@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/lattice/Lattice_view.h>
#include <Grid/lattice/Lattice_base.h> #include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h> #include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h> #include <Grid/lattice/Lattice_ET.h>
@ -35,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_local.h> #include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h> #include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h> #include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h> //#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h> #include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h> #include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h> #include <Grid/lattice/Lattice_coordinate.h>
@ -43,4 +44,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_rng.h> #include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h> #include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h> #include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>

View File

@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp> Author: neo <cossu@post.kek.jp>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -91,12 +92,18 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
{ {
return arg[ss]; return arg[ss];
} }
// What needs this?
// Cannot be legal on accelerator
// Comparison must convert
#if 1
template <class lobj> accelerator_inline template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{ {
auto view = arg.View(); auto view = arg.View(AcceleratorRead);
return view[ss]; return view[ss];
} }
#endif
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand // handle nodes in syntax tree- eval one operand
@ -179,16 +186,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
cb = lat.Checkerboard(); cb = lat.Checkerboard();
} }
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
{
}
template <typename Op, typename T1> inline template <typename Op, typename T1> inline
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr) void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
{ {
CBFromExpression(cb, expr.arg1); // recurse AST CBFromExpression(cb, expr.arg1); // recurse AST
} }
template <typename Op, typename T1, typename T2> inline template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr) void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{ {
@ -203,6 +206,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
CBFromExpression(cb, expr.arg3); // recurse AST CBFromExpression(cb, expr.arg3); // recurse AST
} }
//////////////////////////////////////////////////////////////////////////
// ViewOpen
//////////////////////////////////////////////////////////////////////////
// Lattice leaf of an expression tree: open the operand for accelerator reads.
// Participates in overload resolution only when is_lattice<Leaf>::value is true.
template <class Leaf,
          typename std::enable_if<is_lattice<Leaf>::value, Leaf>::type * = nullptr>
inline void ExpressionViewOpen(Leaf &lat)
{
  lat.ViewOpen(AcceleratorRead);
}
// Non-lattice leaf (is_lattice<Leaf>::value is false): there is no view to
// open, so this overload deliberately does nothing.
template <class Leaf,
          typename std::enable_if<!is_lattice<Leaf>::value, Leaf>::type * = nullptr>
inline void ExpressionViewOpen(Leaf &notlat)
{
}
// Unary expression node: forward the open request to its single operand.
template <typename Op, typename Operand>
inline void ExpressionViewOpen(LatticeUnaryExpression<Op, Operand> &expr)
{
  ExpressionViewOpen(expr.arg1); // recurse AST
}
// Binary expression node: open both operands, first then second.
template <typename Op, typename Lhs, typename Rhs>
inline void ExpressionViewOpen(LatticeBinaryExpression<Op, Lhs, Rhs> &expr)
{
  ExpressionViewOpen(expr.arg1); // recurse AST
  ExpressionViewOpen(expr.arg2); // recurse AST
}
// Trinary expression node: open all three operands in argument order.
template <typename Op, typename First, typename Second, typename Third>
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, First, Second, Third> &expr)
{
  ExpressionViewOpen(expr.arg1); // recurse AST
  ExpressionViewOpen(expr.arg2); // recurse AST
  ExpressionViewOpen(expr.arg3); // recurse AST
}
//////////////////////////////////////////////////////////////////////////
// ViewClose
//////////////////////////////////////////////////////////////////////////
// Lattice leaf of an expression tree: close the view held by this operand.
// Participates in overload resolution only when is_lattice<Leaf>::value is true.
template <class Leaf,
          typename std::enable_if<is_lattice<Leaf>::value, Leaf>::type * = nullptr>
inline void ExpressionViewClose(Leaf &lat)
{
  lat.ViewClose();
}
// Non-lattice leaf (is_lattice<Leaf>::value is false): nothing was opened,
// so there is nothing to close.
template <class Leaf,
          typename std::enable_if<!is_lattice<Leaf>::value, Leaf>::type * = nullptr>
inline void ExpressionViewClose(Leaf &notlat)
{
}
// Unary expression node: forward the close request to its single operand.
template <typename Op, typename Operand>
inline void ExpressionViewClose(LatticeUnaryExpression<Op, Operand> &expr)
{
  ExpressionViewClose(expr.arg1); // recurse AST
}
// Binary expression node: close both operands, first then second.
template <typename Op, typename Lhs, typename Rhs>
inline void ExpressionViewClose(LatticeBinaryExpression<Op, Lhs, Rhs> &expr)
{
  ExpressionViewClose(expr.arg1); // recurse AST
  ExpressionViewClose(expr.arg2); // recurse AST
}
// Trinary expression node: close all three operands in argument order.
template <typename Op, typename First, typename Second, typename Third>
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, First, Second, Third> &expr)
{
  ExpressionViewClose(expr.arg1); // recurse AST
  ExpressionViewClose(expr.arg2); // recurse AST
  ExpressionViewClose(expr.arg3); // recurse AST
}
//////////////////////////////////////////// ////////////////////////////////////////////
// Unary operators and funcs // Unary operators and funcs
//////////////////////////////////////////// ////////////////////////////////////////////

View File

@ -7,6 +7,7 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@ -55,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -72,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -88,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -107,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs); mult(&tmp,&lhs_v(ss),&rhs);
@ -120,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -134,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -147,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -164,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.View(); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -178,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.View(); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -192,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.View(); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -205,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.View(); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -220,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto x_v = x.View(); autoView( x_v , x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss); auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@ -233,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto x_v = x.View(); autoView( x_v , x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);

View File

@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -28,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#define STREAMING_STORES #define STREAMING_STORES
@ -36,129 +38,6 @@ NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16]; extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour.
// It is an empty tag type: LatticeAccelerator derives from it, and the
// is_lattice / ViewMap traits detect lattice-like types with
// std::is_base_of<LatticeBase, T>.
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required.
// Two grids are conformable only if the pointers are equal (identity, not
// structural equality). The check is an assert, so it is compiled out when
// NDEBUG is defined.
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice data through a view object.
// This prevents writing code that will not offload to GPU, but is perhaps annoyingly
// strict, since host code could in principle access the data directly through the lattice object.
// Need to decide on the programming model.
#define LATTICE_VIEW_STRICT
// Plain bookkeeping record for a lattice field: grid pointer, checkerboard tag,
// data pointer and element count. It owns nothing and has no destructor, so the
// implicit copy is a shallow copy of these four members.
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
  GridBase *_grid;        // grid this field refers to; compared by pointer in Conformable()
  int       checkerboard; // checkerboard tag, 0 after default construction
  vobj     *_odata;       // A managed pointer
  uint64_t  _odata_size;  // value reported by oSites()
public:
  // Empty state. The initializers are listed in member declaration order, which is
  // the order C++ actually runs them in (avoids -Wreorder diagnostics).
  accelerator_inline LatticeAccelerator() : _grid(nullptr), checkerboard(0), _odata(nullptr), _odata_size(0) { }

  accelerator_inline uint64_t oSites(void) const { return _odata_size; }
  accelerator_inline int  Checkerboard(void) const { return checkerboard; }
  accelerator_inline int &Checkerboard(void) { return this->checkerboard; } // can assign checkerboard on a container, not a view

  // If the caller already holds a grid, check that it is the same instance as
  // ours; if the caller's pointer is null, store our grid pointer into it.
  accelerator_inline void Conformable(GridBase * &grid) const
  {
    if (grid) conformable(grid, _grid);
    else      grid = _grid;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
// Accessor onto the data described by a LatticeAccelerator. Construction copies
// only the base-class bookkeeping (pointers, size, checkerboard); the elements
// themselves are not copied.
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
  // Build a view referring to the same data as refer_to_me.
  LatticeView(const LatticeAccelerator<vobj> &refer_to_me)
    : LatticeAccelerator<vobj>(refer_to_me)
  {
  }

  // Index range covered by the view: [begin(), end()), with size() == end().
  accelerator_inline uint64_t begin(void) const { return 0; }
  accelerator_inline uint64_t end  (void) const { return this->_odata_size; }
  accelerator_inline uint64_t size (void) const { return this->_odata_size; }

  // Element access by reference (no bounds checking).
  accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }
  accelerator_inline       vobj & operator[](size_t i)       { return this->_odata[i]; }

  // Rvalue access: under __CUDA_ARCH__ this goes through coalescedRead and
  // returns a scalar_object by value; otherwise it returns a const reference.
#ifdef __CUDA_ARCH__
  accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
  accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics. // The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code // This contains extra (host resident) grid pointer data that may be accessed by host code
@ -194,24 +73,33 @@ private:
dealloc(); dealloc();
this->_odata_size = size; this->_odata_size = size;
if ( size ) if ( size )
this->_odata = alloc.allocate(this->_odata_size); this->_odata = alloc.allocate(this->_odata_size);
else else
this->_odata = nullptr; this->_odata = nullptr;
} }
} }
public: public:
/////////////////////////////////////////////////////////////////////////////////
// Can be used to mark the accelerator copy dirty without a copy from the host; useful for temporaries whose previous contents are "don't care"
/////////////////////////////////////////////////////////////////////////////////
// Open a view on this lattice in the requested mode and close it again at once.
// Only the open/close side effects matter; the view itself is never used.
// NOTE(review): what those side effects are depends on LatticeView's
// (accelerator, mode) constructor and ViewClose(), which are defined elsewhere.
void SetViewMode(ViewMode mode) {
  auto *self = (LatticeAccelerator<vobj> *) this;
  LatticeView<vobj> scratch(*self, mode);
  scratch.ViewClose();
}
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops. // Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device // The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas // in device lambdas
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const
LatticeView<vobj> View (ViewMode mode) const
{ {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this)); LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
return accessor; return accessor;
} }
~Lattice() { ~Lattice() {
if ( this->_odata_size ) { if ( this->_odata_size ) {
dealloc(); dealloc();
@ -231,12 +119,16 @@ public:
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = View(); auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@ -251,11 +143,15 @@ public:
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = View(); auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@ -269,11 +165,15 @@ public:
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = View(); auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
//GridFromExpression is tricky to do //GridFromExpression is tricky to do
@ -324,10 +224,11 @@ public:
} }
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View(); auto me = View(CpuWrite);
thread_for(ss,me.size(),{ thread_for(ss,me.size(),{
me[ss] = r; me[ss]= r;
}); });
me.ViewClose();
return *this; return *this;
} }
@ -337,11 +238,12 @@ public:
/////////////////////////////////////////// ///////////////////////////////////////////
// user defined constructor // user defined constructor
/////////////////////////////////////////// ///////////////////////////////////////////
Lattice(GridBase *grid) { Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid; this->_grid = grid;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0); assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0; this->checkerboard=0;
SetViewMode(mode);
} }
// virtual ~Lattice(void) = default; // virtual ~Lattice(void) = default;
@ -357,7 +259,6 @@ public:
// copy constructor // copy constructor
/////////////////////////////////////////// ///////////////////////////////////////////
Lattice(const Lattice& r){ Lattice(const Lattice& r){
// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
this->_grid = r.Grid(); this->_grid = r.Grid();
resize(this->_grid->oSites()); resize(this->_grid->oSites());
*this = r; *this = r;
@ -380,11 +281,12 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0; typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r); conformable(*this,r);
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
auto me = View(); auto me = View(AcceleratorWriteDiscard);
auto him= r.View(); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
me.ViewClose(); him.ViewClose();
return *this; return *this;
} }
@ -394,11 +296,12 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){ inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
conformable(*this,r); conformable(*this,r);
auto me = View(); auto me = View(AcceleratorWriteDiscard);
auto him= r.View(); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
me.ViewClose(); him.ViewClose();
return *this; return *this;
} }
/////////////////////////////////////////// ///////////////////////////////////////////

View File

@ -0,0 +1,226 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_basis.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Project the first k basis vectors out of w, one after another:
//   for j = 0 .. k-1 :   w <- w - <basis[j], w> * basis[j]
//
//  basis : vectors to project out (only entries [0,k) are read)
//  w     : vector updated in place
//  k     : number of leading basis vectors to use
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
  // Performance note kept from the original author:
  // if the basis[j] may be assumed orthonormal already, all inner products
  // could be taken in parallel (saving 2x bandwidth), and the update line
  // could save a further 3x bandwidth; perhaps a 2.5x speed up here and
  // about 2x overall in Multigrid Lanczos.
  int j = 0;
  while ( j < k ) {
    auto ip = innerProduct(basis[j],w);   // coefficient of w along basis[j]
    w = w - ip*basis[j];                  // remove that component
    ++j;
  }
}
// In-place rotation of a set of basis vectors by a coefficient matrix.
//
// For every j in [j0,j1) the vector basis[j] is replaced by
//     sum_{k in [k0,k1)}  Qt(j,k) * basis[k]
// where the right-hand side is evaluated with the basis values held before
// the update of the current block of sites.
//
//  basis : indexable container of lattice fields, updated in place
//  Qt    : coefficient matrix, read through Qt(row,col) for row,col in [0,Nm)
//          and converted to double
//  j0,j1 : half-open range of output vectors to rotate
//  k0,k1 : half-open range of input vectors entering each linear combination
//  Nm    : leading dimension used to flatten Qt into a contiguous array
//
// The site loop is blocked so that the temporary accumulator holds about one
// extra vector worth of storage.
template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{
  typedef decltype(basis[0]) Field;
  typedef decltype(basis[0].View(AcceleratorRead)) View;

  // Empty rotation range: nothing to do. This is tested BEFORE any view is
  // opened, so that this exit path does not leave views without a matching
  // ViewClose().
  int nrot = j1-j0;
  if (!nrot) // edge case not handled gracefully by Cuda
    return;

  Vector<View> basis_v; basis_v.reserve(basis.size());
  GridBase* grid = basis[0].Grid();

  // Open one view per basis vector; all of them are closed at the end.
  for(int k=0;k<basis.size();k++){
    basis_v.push_back(basis[k].View(AcceleratorWrite));
  }

  // NOTE(review): basis_vp is not referenced below; the kernels index basis_v
  // directly. Confirm whether the raw pointer was meant to be used in the kernels.
  View *basis_vp = &basis_v[0];

  uint64_t oSites   =grid->oSites();
  uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead

  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;

  // Accumulator for one block of sites times nrot output vectors.
  Vector <vobj> Bt(siteBlock * nrot);
  auto Bp=&Bt[0];

  // GPU readable copy of matrix
  Vector<double> Qt_jv(Nm*Nm);
  double *Qt_p = & Qt_jv[0];
  thread_for(i,Nm*Nm,{
      int j = i/Nm;
      int k = i%Nm;
      Qt_p[i]=Qt(j,k);
  });

  // Block the loop to keep storage footprint down
  for(uint64_t s=0;s<oSites;s+=siteBlock){

    // remaining work in this block
    int ssites=MIN(siteBlock,oSites-s);

    // zero out the accumulators
    accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
	decltype(coalescedRead(Bp[ss])) z;
	z=Zero();
	coalescedWrite(Bp[ss],z);
    });

    // Accumulate the linear combinations; sj enumerates (site in block, output vector).
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{

	int j =sj%nrot;   // output vector index within [0,nrot)
	int jj  =j0+j;    // output vector index within the basis
	int ss =sj/nrot;  // site index within this block
	int sss=ss+s;     // site index within the whole local lattice

	for(int k=k0; k<k1; ++k){
	  auto tmp = coalescedRead(Bp[ss*nrot+j]);
	  coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
	}
    });

    // Copy the finished block back over the basis vectors. This is a separate
    // pass so that all reads of the old values in this block are done first.
    accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
	int j =sj%nrot;
	int jj  =j0+j;
	int ss =sj/nrot;
	int sss=ss+s;
	coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
    });
  }

  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
// Extract a single rotated vector
//
// Computes, site by site,
//     result = sum_{k in [k0,k1)}  Qt(j,k) * basis[k]
// without modifying the basis itself.
//
//  result : output field; its checkerboard is set to that of basis[0]
//  basis  : input vectors (opened for reading only)
//  Qt     : coefficient matrix; row j, columns [0,Nm) are copied out
//  j      : row of Qt selecting the rotated vector to build
//  k0,k1  : half-open range of basis vectors entering the sum
//  Nm     : number of columns of Qt copied into the temporary row
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View(AcceleratorRead)) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
// One read view per basis vector; each is closed explicitly at the end.
Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){
basis_v.push_back(basis[k].View(AcceleratorRead));
}
// Zero-valued site object used to initialise the per-site accumulator.
vobj zz=Zero();
// Contiguous copy of row j of Qt, read inside the site loop through Qt_j.
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
// B accumulates the linear combination for site ss.
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
// Close the views opened above (result_v is handled by autoView; presumably scope-based — TODO confirm).
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
// Build the index table that orders sort_vals by increasing absolute value.
// Returns a vector idx of the same length as sort_vals such that
// |sort_vals[idx[0]]|, |sort_vals[idx[1]]|, ... is non-decreasing.
// sort_vals itself is not modified.
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
  // Start from the identity permutation 0,1,2,...
  std::vector<int> idx(sort_vals.size());
  for (size_t n=0;n<idx.size();n++) {
    idx[n] = static_cast<int>(n);
  }

  // Order the indices by the magnitude of the value each one refers to.
  auto smallerMagnitude = [&sort_vals](int i1, int i2) {
    return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
  };
  std::sort(idx.begin(), idx.end(), smallerMagnitude);

  return idx;
}
// Sort vectors and their associated values together, in place, by the
// absolute value of sort_vals.
//
//  _v        : vectors to reorder
//  sort_vals : sort keys, reordered alongside _v
//  reverse   : false -> increasing magnitude, true -> the reversed order
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
  // Permutation giving increasing |sort_vals|.
  std::vector<int> order = basisSortGetIndex(sort_vals);

  if (reverse) {
    std::reverse(order.begin(), order.end());
  }

  // Apply the permutation to both the vectors and the keys.
  basisReorderInPlace(_v,sort_vals,order);
}
// Accumulate into result, for every vector _v[i], the term
//     ( <_v[i], src_orig> / eval[i] ) * _v[i]
// starting from result = 0.
//
//  _v       : vectors to project onto (read only)
//  eval     : one divisor per vector; must have the same length as _v (asserted)
//  src_orig : source field
//  result   : output field, overwritten
//
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
  result = Zero();
  assert(_v.size()==eval.size());
  int N = (int)_v.size();
  for (int i=0;i<N;i++) {
    // _v is a const container, so its elements can only be bound to a
    // reference-to-const; a plain Field& here does not compile once the
    // template is instantiated.
    const Field& tmp = _v[i];
    // presumably axpy(r,a,x,y) sets r = a*x + y — confirm against its definition
    axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
  }
}
NAMESPACE_END(Grid);

View File

@ -78,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
}); });
@ -93,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{ {
Lattice<vPredicate> ret(lhs.Grid()); Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs); ret_v[ss]=op(lhs_v[ss],rhs);
}); });
@ -107,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]); ret_v[ss]=op(lhs,rhs_v[ss]);
}); });

View File

@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid(); GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
auto l_v = l.View(); autoView(l_v, l, CpuWrite);
thread_for( o, grid->oSites(), { thread_for( o, grid->oSites(), {
vector_type vI; vector_type vI;
Coordinate gcoor; Coordinate gcoor;
@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
}); });
}; };
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -43,8 +43,8 @@ template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
}); });
@ -56,9 +56,9 @@ template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
}); });
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr; typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid()); Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),1,{ accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer // FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop // Use vector [] operator and don't read coalesce this loop

View File

@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); autoView( X_v , X, CpuRead);
auto Y_v = Y.View(); autoView( Y_v , Y, CpuRead);
auto R_v = R.View(); autoView( R_v , R, CpuWrite);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); autoView( X_v , X, CpuRead);
auto R_v = R.View(); autoView( R_v , R, CpuWrite);
thread_region thread_region
{ {
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, CpuRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, CpuRead);
thread_region { thread_region {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);

View File

@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
}); });
return ret; return ret;
@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
}); });
return ret; return ret;
@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{ {
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorWrite);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
}); });
} }
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{ {
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorWrite);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
}); });
} }
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// extract-modify-merge cycle is easiest way and this is not perf critical // extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); autoView( l_v , l, CpuWrite);
if ( rank == grid->ThisRank() ) { if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf); extract(l_v[odx],buf);
buf[idx] = s; buf[idx] = s;
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); autoView( l_v , l, CpuWrite);
extract(l_v[odx],buf); extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
return; return;
}; };
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Must be CPU read view
template<class vobj,class sobj> template<class vobj,class sobj>
accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid = l.Grid(); GridBase *grid = l.getGrid();
assert(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@ -173,8 +173,7 @@ accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
@ -183,18 +182,19 @@ accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate
return; return;
}; };
// Must be CPU write view
template<class vobj,class sobj> template<class vobj,class sobj>
accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid=l.Grid(); GridBase *grid=l.getGrid();
assert(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@ -202,13 +202,11 @@ accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w]; vp[idx+w*Nsimd] = pt[w];
} }
return; return;
}; };

View File

@ -40,8 +40,11 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View(); autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss))); coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
}); });
@ -50,8 +53,11 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View(); autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
}); });

View File

@ -5,6 +5,7 @@
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or the Free Software Foundation; either version 2 of the License, or
@ -24,7 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/Grid_Eigen_Dense.h> #include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC #if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h> #include <Grid/lattice/Lattice_reduction_gpu.h>
#endif #endif
@ -38,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd(); // const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
@ -62,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
ssum = ssum+sumarray[i]; ssum = ssum+sumarray[i];
} }
return ssum; typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
} }
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{ {
#ifdef GRID_NVCC #if defined(GRID_CUDA)||defined(GRID_HIP)
return sum_gpu(arg,osites); return sum_gpu(arg,osites);
#else #else
return sum_cpu(arg,osites); return sum_cpu(arg,osites);
#endif #endif
} }
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{ {
auto arg_v = arg.View(); #if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites(); Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites); auto ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum); arg.Grid()->GlobalSum(ssum);
return ssum; return ssum;
} }
@ -93,55 +143,49 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
// Double inner product // Double inner product
template<class vobj> template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{ {
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type; typedef typename vobj::vector_typeD vector_type;
ComplexD nrm; ComplexD nrm;
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
// Might make all code paths go this way.
auto left_v = left.View();
auto right_v=right.View();
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC // Might make all code paths go this way.
// GPU - SIMT lane compliance... typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
accelerator_for( ss, sites, nsimd,{ // GPU - SIMT lane compliance...
auto x_l = left_v(ss); accelerator_for( ss, sites, 1,{
auto y_l = right_v(ss); auto x_l = left_v[ss];
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); auto y_l = right_v[ss];
}) inner_tmp_v[ss]=innerProductD(x_l,y_l);
});
}
// This is in single precision and fails some tests // This is in single precision and fails some tests
// Need a sumD that sums in double auto anrm = sum(inner_tmp_v,sites);
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites)); nrm = anrm;
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
grid->GlobalSum(nrm);
return nrm; return nrm;
} }
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
ComplexD nrm = rankInnerProduct(left,right);
grid->GlobalSum(nrm);
return nrm;
}
///////////////////////// /////////////////////////
// Fast axpby_norm // Fast axpby_norm
// z = a x + b y // z = a x + b y
@ -167,45 +211,67 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid(); GridBase *grid = x.Grid();
auto x_v=x.View();
auto y_v=y.View();
auto z_v=z.View();
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU // GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; autoView( x_v, x, AcceleratorRead);
Vector<inner_t> inner_tmp(sites); autoView( y_v, y, AcceleratorRead);
auto inner_tmp_v = &inner_tmp[0]; autoView( z_v, z, AcceleratorWrite);
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, 1,{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp); inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp; z_v[ss]=tmp;
}); });
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites))); nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
return nrm; return nrm;
} }
template<class vobj> strong_inline void
innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
{
conformable(left,right);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
Vector<ComplexD> tmp(2);
GridBase *grid = left.Grid();
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
// GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
{
autoView(left_v,left, AcceleratorRead);
autoView(right_v,right,AcceleratorRead);
accelerator_for( ss, sites, 1,{
auto left_tmp = left_v[ss];
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
});
}
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0];
nrm = real(tmp[1]);
}
template<class Op,class T1> template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object ->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
@ -271,7 +337,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
auto Data_v=Data.View(); autoView( Data_v, Data, CpuRead);
thread_for( r,rd, { thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
@ -349,8 +415,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View(); autoView( lhv, lhs, CpuRead);
auto rhv=rhs.View(); autoView( rhv, rhs, CpuRead);
thread_for( r,rd,{ thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@ -457,14 +523,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
auto Rv=R.View(); autoView( Rv, R, CpuWrite);
auto Xv=X.View(); autoView( Xv, X, CpuRead);
auto Yv=Y.View(); autoView( Yv, Y, CpuRead);
thread_for_collapse(2, n, e1, { thread_for2d( n, e1, b,e2, {
for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss]; Rv[ss] = at*Xv[ss]+Yv[ss];
}
}); });
} }
}; };
@ -517,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v=X.View(); autoView( X_v, X, CpuRead);
auto Y_v=Y.View(); autoView( Y_v, Y, CpuRead);
auto R_v=R.View(); autoView( R_v, R, CpuWrite);
thread_region thread_region
{ {
Vector<vobj> s_x(Nblock); Vector<vobj> s_x(Nblock);
@ -564,13 +628,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// int nl=1; // int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog]; int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View(); autoView( R_v, R, CpuWrite);
auto X_v = X.View(); autoView( X_v, X, CpuRead);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@ -628,8 +693,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto rhs_v=rhs.View(); autoView( rhs_v, rhs, CpuRead);
thread_region thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);

View File

@ -1,7 +1,13 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define WARP_SIZE 32 #ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props; extern cudaDeviceProp *gpu_props;
#endif
#define WARP_SIZE 32
__device__ unsigned int retirementCount = 0; __device__ unsigned int retirementCount = 0;
template <class Iterator> template <class Iterator>
@ -19,7 +25,12 @@ template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device; int device;
#ifdef GRID_CUDA
cudaGetDevice(&device); cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize; Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock; Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
@ -147,7 +158,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
sobj *smem = (sobj *)shmem_pointer; sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished // wait until all outstanding memory instructions in this thread are finished
__threadfence(); acceleratorFence();
if (tid==0) { if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x); unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
@ -156,8 +167,8 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
} }
// each thread must read the correct value of amLast // each thread must read the correct value of amLast
__syncthreads(); acceleratorSynchroniseAll();
if (amLast) { if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1] // reduce buffer[0], ..., buffer[gridDim.x-1]
Iterator i = tid; Iterator i = tid;
@ -199,13 +210,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
cudaDeviceSynchronize(); accelerator_barrier();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
auto result = buffer_v[0]; auto result = buffer_v[0];
return result; return result;
} }

View File

@ -375,7 +375,7 @@ public:
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View(); autoView(l_v, l, CpuWrite);
thread_for( ss, osites, { thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd); ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
@ -461,8 +461,8 @@ public:
} }
{ {
// Obtain one reseeded generator per thread // Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads(); int Nthread = 32; // Hardwire a good level of parallelism
std::vector<RngEngine> seeders(Nthread); std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){ for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine); seeders[t] = Reseed(master_engine);

View File

@ -37,17 +37,19 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace // Trace
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
/*
template<class vobj> template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))> inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{ {
Lattice<decltype(trace(vobj()))> ret(lhs.Grid()); Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView(ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView(lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss))); coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
}); });
return ret; return ret;
}; };
*/
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation // Trace Index level dependent operation
@ -56,8 +58,8 @@ template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{ {
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
}); });

View File

@ -6,6 +6,7 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -46,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard // remove and insert a half checkerboard
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
{
half.Checkerboard() = cb; half.Checkerboard() = cb;
auto half_v = half.View(); autoView( half_v, half, CpuWrite);
auto full_v = full.View(); autoView( full_v, full, CpuRead);
thread_for(ss, full.Grid()->oSites(),{ thread_for(ss, full.Grid()->oSites(),{
int cbos; int cbos;
Coordinate coor; Coordinate coor;
@ -63,10 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
} }
}); });
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
{
int cb = half.Checkerboard(); int cb = half.Checkerboard();
auto half_v = half.View(); autoView( half_v , half, CpuRead);
auto full_v = full.View(); autoView( full_v , full, CpuWrite);
thread_for(ss,full.Grid()->oSites(),{ thread_for(ss,full.Grid()->oSites(),{
Coordinate coor; Coordinate coor;
@ -81,96 +84,138 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
} }
}); });
} }
template<class vobj,class CComplex,int nbasis>
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
Lattice<CComplex> ip(coarse); ////////////////////////////////////////////////////////////////////////////////////////////
// Flexible Type Conversion for internal promotion to double as well as graceful
// auto fineData_ = fineData.View(); // treatment of scalar-compatible types
auto coarseData_ = coarseData.View(); ////////////////////////////////////////////////////////////////////////////////////////////
auto ip_ = ip.View(); accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
for(int v=0;v<nbasis;v++) { out = in;
blockInnerProduct(ip,Basis[v],fineData);
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
coalescedWrite(coarseData_[sc](v),ip_(sc));
});
}
} }
template<class vobj,class CComplex,int nbasis> accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData, out = in;
const Lattice<vobj> &fineData, }
const std::vector<Lattice<vobj> > &Basis)
{
typedef iVector<CComplex,nbasis > coarseSiteData;
coarseSiteData elide;
typedef decltype(coalescedRead(elide)) ScalarComplex;
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension;
// checks #ifdef GRID_SIMT
assert( nbasis == Basis.size() ); accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
subdivides(coarse,fine); ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
for(int i=0;i<nbasis;i++){ }
conformable(Basis[i],fineData); accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
} ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
}
#endif
Coordinate block_r (_ndimension); accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
for(int d=0 ; d<_ndimension;d++){ }
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
}
int blockVol = fine->oSites()/coarse->oSites();
coarseData=Zero(); accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
}
auto fineData_ = fineData.View(); template<typename T1,typename T2,int N>
auto coarseData_ = coarseData.View(); accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
//////////////////////////////////////////////////////////////////////////////////////////////////////// template<typename T1,typename T2,int N>
// To make this lock free, loop over coarse in parallel, and then loop over fine associated with coarse. accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
// Otherwise do fine inner product per site, and make the update atomic
////////////////////////////////////////////////////////////////////////////////////////////////////////
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
auto sc=sci/nbasis; template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
auto i=sci%nbasis; accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
auto Basis_ = Basis[i].View(); convertType(out,in._internal);
}
Coordinate coor_c(_ndimension); template<typename T1,typename T2>
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
convertType(out._internal,in);
}
int sf; template<typename T1,typename T2,int N>
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero(); accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
for (int i=0;i<N;i++)
for (int j=0;j<N;j++)
convertType(out._internal[i][j],in._internal[i][j]);
}
for(int sb=0;sb<blockVol;sb++){ template<typename T1,typename T2,int N>
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
for (int i=0;i<N;i++)
convertType(out._internal[i],in._internal[i]);
}
Coordinate coor_b(_ndimension); template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
Coordinate coor_f(_ndimension); accelerator_inline void convertType(T & out, const T & in) {
out = in;
}
Lexicographic::CoorFromIndex(coor_b,sb,block_r); template<typename T1,typename T2>
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d]; accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions); autoView( out_v , out,AcceleratorWrite);
autoView( in_v , in ,AcceleratorRead);
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf)); accelerator_for(ss,out_v.size(),T1::Nsimd(),{
} convertType(out_v[ss],in_v(ss));
coalescedWrite(coarseData_[sc](i),reduce);
}); });
return;
} }
template<class vobj,class CComplex> ////////////////////////////////////////////////////////////////////////////////////////////
inline void blockZAXPY(Lattice<vobj> &fineZ, // precision-promoted local inner product
const Lattice<CComplex> &coarseA, ////////////////////////////////////////////////////////////////////////////////////////////
const Lattice<vobj> &fineX, template<class vobj>
const Lattice<vobj> &fineY) inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
{
autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead);
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
Lattice<iScalar<t_inner>> ret(lhs.Grid());
{
autoView(ret_v, ret,AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
});
}
return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////
// block routines
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const VLattice &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
Lattice<iScalar<CComplex>> ip(coarse);
Lattice<vobj> fineDataRed = fineData;
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( ip_ , ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
}
}
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
const Lattice<CComplex> &coarseA,
const Lattice<vobj2> &fineX,
const Lattice<vobj> &fineY)
{ {
GridBase * fine = fineZ.Grid(); GridBase * fine = fineZ.Grid();
GridBase * coarse= coarseA.Grid(); GridBase * coarse= coarseA.Grid();
@ -182,7 +227,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
conformable(fineX,fineZ); conformable(fineX,fineZ);
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
Coordinate block_r (_ndimension); Coordinate block_r (_ndimension);
// FIXME merge with subdivide checking routine as this is redundant // FIXME merge with subdivide checking routine as this is redundant
@ -191,29 +236,66 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
auto fineZ_ = fineZ.View(); autoView( fineZ_ , fineZ, AcceleratorWrite);
auto fineX_ = fineX.View(); autoView( fineX_ , fineX, AcceleratorRead);
auto fineY_ = fineY.View(); autoView( fineY_ , fineY, AcceleratorRead);
auto coarseA_= coarseA.View(); autoView( coarseA_, coarseA, AcceleratorRead);
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
int sc;
Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions); int sc;
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; Coordinate coor_c(_ndimension);
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Coordinate coor_f(_ndimension);
// z = A x + y Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf)); for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
}); // z = A x + y
#ifdef GRID_SIMT
typename vobj2::tensor_reduced::scalar_object cA;
typename vobj::scalar_object cAx;
#else
typename vobj2::tensor_reduced cA;
vobj cAx;
#endif
convertType(cA,TensorRemove(coarseA_(sc)));
auto prod = cA*fineX_(sf);
convertType(cAx,prod);
coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
});
return; return;
} }
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY)
{
typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
GridBase *coarse(CoarseInner.Grid());
GridBase *fine (fineX.Grid());
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse);
// Precision promotion
fine_inner = localInnerProductD<vobj>(fineX,fineY);
blockSum(coarse_inner,fine_inner);
{
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
});
}
}
template<class vobj,class CComplex> // deprecate
inline void blockInnerProduct(Lattice<CComplex> &CoarseInner, inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
const Lattice<vobj> &fineX, const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY) const Lattice<vobj> &fineY)
@ -227,15 +309,17 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion? // Precision promotion?
auto CoarseInner_ = CoarseInner.View();
auto coarse_inner_ = coarse_inner.View();
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
accelerator_for(ss, coarse->oSites(), 1, { {
CoarseInner_[ss] = coarse_inner_[ss]; autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
}); autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss];
});
}
} }
template<class vobj,class CComplex> template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX) inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
{ {
@ -248,7 +332,7 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
// useful in multigrid project; // useful in multigrid project;
// Generic name : Coarsen? // Generic name : Coarsen?
template<class vobj> template<class vobj>
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
{ {
GridBase * fine = fineData.Grid(); GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid(); GridBase * coarse= coarseData.Grid();
@ -256,9 +340,9 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
subdivides(coarse,fine); // require they map subdivides(coarse,fine); // require they map
int _ndimension = coarse->_ndimension; int _ndimension = coarse->_ndimension;
Coordinate block_r (_ndimension); Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
@ -266,32 +350,33 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
// Turn this around to loop threaded over sc and interior loop // Turn this around to loop threaded over sc and interior loop
// over sf would thread better // over sf would thread better
auto coarseData_ = coarseData.View(); autoView( coarseData_ , coarseData, AcceleratorWrite);
auto fineData_ = fineData.View(); autoView( fineData_ , fineData, AcceleratorRead);
accelerator_for(sc,coarse->oSites(),1,{ accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block // One thread per sub block
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
coarseData_[sc]=Zero(); coarseData_[sc]=Zero();
for(int sb=0;sb<blockVol;sb++){ for(int sb=0;sb<blockVol;sb++){
int sf;
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
coarseData_[sc]=coarseData_[sc]+fineData_[sf]; int sf;
} Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
}); coarseData_[sc]=coarseData_[sc]+fineData_[sf];
}
});
return; return;
} }
template<class vobj> template<class vobj>
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor) inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
{ {
@ -313,8 +398,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
} }
} }
template<class vobj,class CComplex> template<class CComplex,class VLattice>
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
{ {
GridBase *coarse = ip.Grid(); GridBase *coarse = ip.Grid();
GridBase *fine = Basis[0].Grid(); GridBase *fine = Basis[0].Grid();
@ -322,23 +407,30 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
int nbasis = Basis.size() ; int nbasis = Basis.size() ;
// checks // checks
subdivides(coarse,fine); subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
conformable(Basis[i].Grid(),fine); conformable(Basis[i].Grid(),fine);
} }
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
for(int u=0;u<v;u++) { for(int u=0;u<v;u++) {
//Inner product & remove component //Inner product & remove component
blockInnerProduct(ip,Basis[u],Basis[v]); blockInnerProductD(ip,Basis[u],Basis[v]);
ip = -ip; ip = -ip;
blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]); blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
} }
blockNormalise(ip,Basis[v]); blockNormalise(ip,Basis[v]);
} }
} }
template<class vobj,class CComplex>
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated inaccurate naming
{
blockOrthonormalize(ip,Basis);
}
#if 0 #if 0
// TODO: CPU optimized version here
template<class vobj,class CComplex,int nbasis> template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData, Lattice<vobj> &fineData,
@ -360,8 +452,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
auto fineData_ = fineData.View(); autoView( fineData_ , fineData, AcceleratorWrite);
auto coarseData_ = coarseData.View(); autoView( coarseData_ , coarseData, AcceleratorRead);
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{ accelerator_for(sf,fine->oSites(),1,{
@ -374,7 +466,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View(); /* auto basis_ = Basis[i], );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]); if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]); else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
} }
@ -383,24 +475,25 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
#else #else
template<class vobj,class CComplex,int nbasis> template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData, inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData, Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis) const VLattice &Basis)
{ {
GridBase * fine = fineData.Grid(); GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid(); GridBase * coarse= coarseData.Grid();
fineData=Zero(); fineData=Zero();
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i); Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
Lattice<CComplex> cip(coarse);
auto cip_ = cip.View(); //Lattice<CComplex> cip(coarse);
auto ip_ = ip.View(); //autoView( cip_ , cip, AcceleratorWrite);
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{ //autoView( ip_ , ip, AcceleratorRead);
coalescedWrite(cip_[sc], ip_(sc)()); //accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
}); // coalescedWrite(cip_[sc], ip_(sc)());
blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData); // });
//blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
blockZAXPY(fineData,ip,Basis[i],fineData);
} }
} }
#endif #endif
@ -427,15 +520,17 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->lSites() == og->lSites()); assert(ig->lSites() == og->lSites());
} }
autoView(in_v,in,CpuRead);
autoView(out_v,out,CpuWrite);
thread_for(idx, ig->lSites(),{ thread_for(idx, ig->lSites(),{
sobj s; sobj s;
ssobj ss; ssobj ss;
Coordinate lcoor(ni); Coordinate lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor); ig->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(s,in,lcoor); peekLocalSite(s,in_v,lcoor);
ss=s; ss=s;
pokeLocalSite(ss,out,lcoor); pokeLocalSite(ss,out_v,lcoor);
}); });
} }
@ -470,8 +565,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions; Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride; Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride; Coordinate ost = Tg->_ostride;
auto t_v = To.View();
auto f_v = From.View(); autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{ accelerator_for(idx,Fg->lSites(),1,{
sobj s; sobj s;
Coordinate Fcoor(nd); Coordinate Fcoor(nd);
@ -494,8 +590,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type punning will not work tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type punning will not work
} }
// peekLocalSite(s,From,Fcoor);
// pokeLocalSite(s,To ,Tcoor);
} }
}); });
} }
@ -526,6 +620,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@ -538,8 +634,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl++];
} }
} }
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDimv,hcoor);
}); });
} }
@ -567,6 +663,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@ -579,8 +677,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl++];
} }
} }
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDimv,lcoor);
}); });
} }
@ -608,6 +706,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@ -616,8 +716,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
hcoor[orthog] = slice_hi; hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDimv,hcoor);
} }
}); });
} }
@ -645,6 +745,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@ -653,8 +755,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
hcoor[orthog] = slice_hi; hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDimv,lcoor);
} }
}); });
} }
@ -718,7 +820,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
} }
//loop over outer index //loop over outer index
auto in_v = in.View(); autoView( in_v , in, CpuRead);
thread_for(in_oidx,in_grid->oSites(),{ thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> out_ptrs(in_nsimd); ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@ -811,7 +913,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
icoor[lane].resize(ndim); icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane); grid->iCoorFromIindex(icoor[lane],lane);
} }
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for(oidx, grid->oSites(),{ thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> ptrs(nsimd); ExtractPointerArray<sobj> ptrs(nsimd);
@ -914,7 +1016,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
std::vector<SobjOut> in_slex_conv(in_grid->lSites()); std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in); unvectorizeToLexOrdArray(in_slex_conv, in);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{ thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim); Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx); out_grid->oCoorFromOindex(out_ocoor, out_oidx);

View File

@ -38,17 +38,19 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose // Transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
/*
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
}); });
return ret; return ret;
}; };
*/
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose // Index level dependent transpose
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -56,8 +58,8 @@ template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))> inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{ {
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
}); });

View File

@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs, rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret, ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{ accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y); ret[ss]=pow(rhs[ss],y);
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
} }
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y)); coalescedWrite(ret[ss],mod(rhs(ss),y));
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
ret.Checkerboard() = rhs_i.Checkerboard(); ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y)); coalescedWrite(ret[ss],div(rhs(ss),y));
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));

168
Grid/lattice/Lattice_view.h Normal file
View File

@ -0,0 +1,168 @@
#pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
// Empty tag type: it carries no data or behaviour and exists only so that traits can
// recognise lattice containers via std::is_base_of<LatticeBase, T>.
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
// Asserts that lhs and rhs point to the very same grid object. The comparison is by
// pointer identity, not by grid contents; being an assert(), it is compiled out when
// NDEBUG is defined.
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host code could in principle directly access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
// Minimal lattice state that is safe to copy into accelerator code: the grid pointer,
// the checkerboard tag, the site-data pointer with its length, and the view advice.
// It is the common base of the view type defined below in this file.
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
  //public:
  GridBase   *_grid;        // grid this data lives on (returned by getGrid on the host)
  int         checkerboard; // checkerboard tag, see Checkerboard()
  vobj       *_odata;       // A managed pointer
  uint64_t    _odata_size;  // number of vobj elements behind _odata
  ViewAdvise  advise;       // hint handed to the memory manager when a view is opened
public:
  // Members are listed in declaration order: that is the order in which C++ initialises
  // them regardless of how the list is written, and keeping the two in step avoids the
  // -Wreorder warning the previous ordering produced.
  accelerator_inline LatticeAccelerator()
    : _grid(nullptr), checkerboard(0), _odata(nullptr), _odata_size(0), advise(AdviseDefault) { }

  // Number of stored vobj elements.
  accelerator_inline uint64_t oSites(void) const { return _odata_size; };

  accelerator_inline int  Checkerboard(void) const { return checkerboard; };
  accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view

  accelerator_inline ViewAdvise  Advise(void) const { return advise; };
  accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view

  // Grid bookkeeping helper: when `grid` is already set, assert that it is this object's
  // grid; when it is still null, set it to this object's grid.
  accelerator_inline void Conformable(GridBase * &grid) const
  {
    if (grid) conformable(grid, _grid);
    else      grid = _grid;
  };

  // Host only
  GridBase * getGrid(void) const { return _grid; };
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
// Accessor object for the site data of a lattice. It inherits the data pointer and size
// from LatticeAccelerator and additionally remembers how the view was opened so that
// ViewClose() can report it back to the MemoryManager.
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
// mode    : the ViewMode passed to ViewOpen()
// cpu_ptr : value of _odata before ViewOpen() replaced it; the key handed to ViewClose()
// NOTE(review): neither member is set by the single-argument constructor below, so
// ViewClose() must only be called on views that went through ViewOpen().
ViewMode mode;
void * cpu_ptr;
// Read access by call operator. With GRID_SIMT the element is fetched through
// coalescedRead() and returned by value as a scalar_object; otherwise a const reference
// to the stored vobj is returned.
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
return coalescedRead(this->_odata[i]);
}
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
// Direct element access (no bounds checking is performed here).
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
// Index range of the view: [begin(), end()) == [0, _odata_size).
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
// Copies the base-class state only; does not call ViewOpen().
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
// Copies the base-class state and immediately opens the view in the requested mode.
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
this->ViewOpen(mode);
}
// Host functions
// Saves the current _odata in cpu_ptr, records the mode, and replaces _odata with the
// pointer returned by MemoryManager::ViewOpen for this buffer (size in bytes), mode and advise.
void ViewOpen(ViewMode mode)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
this->cpu_ptr = (void *)this->_odata;
this->mode = mode;
this->_odata =(vobj *)
MemoryManager::ViewOpen(this->cpu_ptr,
this->_odata_size*sizeof(vobj),
mode,
this->advise);
}
// Tells the MemoryManager that the view opened on cpu_ptr with `mode` is finished.
void ViewClose(void)
{ // Inform the manager
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
// Little autoscope assister
// Scope guard for views: stores its own copy of a view and calls ViewClose() on that
// copy when the guard is destroyed, so every guard closes exactly once.
template<class View>
class ViewCloser
{
  View v;  // Take a copy of view and call view close when I go out of scope automatically
public:
  ViewCloser(View &_v) : v(_v) {}

  // A copied guard would call ViewClose() a second time for the same open, so the guard
  // is deliberately non-copyable. Direct initialisation (as done by autoView) still works.
  ViewCloser(const ViewCloser &) = delete;
  ViewCloser &operator=(const ViewCloser &) = delete;

  ~ViewCloser() { v.ViewClose(); }
};
// autoView(l_v, l, mode)
//   Declares a local variable `l_v` initialised from l.View(mode), followed by a
//   ViewCloser named _autoView<l_v> that holds a copy of l_v and calls ViewClose() on it
//   when the enclosing scope ends.
//   The expansion is two declarations, so the macro has to be used where declarations
//   are allowed (e.g. not as the unbraced body of an if/for), and a given l_v name can be
//   used only once per scope.
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
// Empty tag base for the expression node classes below; detected by is_lattice_expr.
class LatticeExpressionBase {};
// is_lattice<T>::value      : true when T derives from LatticeBase
// is_lattice_expr<T>::value : true when T derives from LatticeExpressionBase
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
// ViewMapBase selects the type an expression node stores for an operand of type T:
// T itself in the general case, LatticeView<T::vector_object> when T is a lattice.
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
// ViewMap<T>::Type applies that selection, keyed on whether T derives from LatticeBase.
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
// Expression node for a one-operand operation: stores the operator object together with
// the operand, where the operand is held as ViewMap<_T1>::Type (a LatticeView when _T1
// is a lattice, otherwise _T1 itself).
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
  using T1 = typename ViewMap<_T1>::Type;

  Op op;    // the operation to apply
  T1 arg1;  // its single operand

  LatticeUnaryExpression(Op operation, const _T1 &operand)
    : op(operation),
      arg1(operand)
  {}
};
// Expression node for a two-operand operation: stores the operator object and both
// operands, each held as ViewMap<...>::Type (a LatticeView for lattice operands,
// otherwise the operand type itself).
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
  using T1 = typename ViewMap<_T1>::Type;
  using T2 = typename ViewMap<_T2>::Type;

  Op op;    // the operation to apply
  T1 arg1;  // first operand
  T2 arg2;  // second operand

  LatticeBinaryExpression(Op operation, const _T1 &first, const _T2 &second)
    : op(operation),
      arg1(first),
      arg2(second)
  {}
};
// Expression node for a three-operand operation: stores the operator object and all
// three operands, each held as ViewMap<...>::Type (a LatticeView for lattice operands,
// otherwise the operand type itself).
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
  using T1 = typename ViewMap<_T1>::Type;
  using T2 = typename ViewMap<_T2>::Type;
  using T3 = typename ViewMap<_T3>::Type;

  Op op;    // the operation to apply
  T1 arg1;  // first operand
  T2 arg2;  // second operand
  T3 arg3;  // third operand

  LatticeTrinaryExpression(Op operation, const _T1 &first, const _T2 &second, const _T3 &third)
    : op(operation),
      arg1(first),
      arg2(second),
      arg3(third)
  {}
};
NAMESPACE_END(Grid);

View File

@ -341,7 +341,7 @@ class BinaryIO {
int ieee32big = (format == std::string("IEEE32BIG")); int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32")); int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG")); int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64")); int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
assert(ieee64||ieee32|ieee64big||ieee32big); assert(ieee64||ieee32|ieee64big||ieee32big);
assert((ieee64+ieee32+ieee64big+ieee32big)==1); assert((ieee64+ieee32+ieee64big+ieee32big)==1);
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////

View File

@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger {
}; };
}; };
// Copies a double-stored gauge object from file representation (fobj) to the in-memory
// scalar representation (sobj), one colour-matrix entry at a time, for all Nds link
// slots. Element assignment performs whatever conversion the two entry types define.
template<class fobj,class sobj>
struct GaugeDoubleStoredMunger{
  void operator()(fobj &in, sobj &out) {
    for (int link = 0; link < Nds; ++link) {
      for (int row = 0; row < Nc; ++row) {
        for (int col = 0; col < Nc; ++col) {
          out(link)()(row, col) = in(link)()(row, col);
        }
      }
    }
  }
};
// Inverse direction of GaugeDoubleStoredMunger: copies a double-stored gauge object from
// the in-memory scalar representation (sobj) to the file representation (fobj), entry by
// entry for all Nds link slots.
template <class fobj, class sobj>
struct GaugeDoubleStoredUnmunger {
  void operator()(sobj &in, fobj &out) {
    for (int link = 0; link < Nds; ++link) {
      for (int row = 0; row < Nc; ++row) {
        for (int col = 0; col < Nc; ++col) {
          out(link)()(row, col) = in(link)()(row, col);
        }
      }
    }
  }
};
template<class fobj,class sobj> template<class fobj,class sobj>
struct Gauge3x2munger{ struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){ void operator() (fobj &in,sobj &out){

View File

@ -146,7 +146,7 @@ public:
int ieee32big = (format == std::string("IEEE32BIG")); int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32")); int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG")); int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64")); int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger; // depending on datatype, set up munger;

224
Grid/parallelIO/OpenQcdIO.h Normal file
View File

@ -0,0 +1,224 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/OpenQcdIO.h
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Header found at the start of an openQCD configuration file: the lattice extents
// Nt, Nx, Ny, Nz and the plaquette value. OpenQcdIO::readHeader fills this struct by
// reading sizeof(OpenQcdHeader) raw bytes from the beginning of the file.
// NOTE(review): that raw read presumes the macro lays the members out in exactly this
// order with no extra data members — confirm against GRID_SERIALIZABLE_CLASS_MEMBERS.
struct OpenQcdHeader : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
int, Nt,
int, Nx,
int, Ny,
int, Nz,
double, plaq);
};
// Reader for gauge configurations in the openQCD file format.
//
// The file consists of an OpenQcdHeader followed by link data for the odd sites only;
// every odd site carries 2*Nd colour matrices ("double stored": forward and backward
// link per direction). readConfiguration() turns that into Grid's single-stored
// Lorentz gauge field and verifies the plaquette against the header.
class OpenQcdIO : public BinaryIO {
public:
  static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6

  // Reads the header of `file`, checks it against `grid`, and fills `field`
  // (dimension[0..3] = Nx,Ny,Nz,Nt, plaquette, data_start).
  // Returns the byte offset at which the link data begins.
  // All consistency checks are assert()s.
  static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) {
    OpenQcdHeader header;

    {
      // The header is read as raw bytes directly into the struct.
      std::ifstream fin(file, std::ios::in | std::ios::binary);
      fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
      assert(!fin.fail());
      field.data_start = fin.tellg();
      fin.close();
    }

    // Bring the stored plaquette to Grid's normalisation.
    header.plaq /= normalisationFactor;

    // sanity check (should trigger on endian issues)
    assert(0 < header.Nt && header.Nt <= 1024);
    assert(0 < header.Nx && header.Nx <= 1024);
    assert(0 < header.Ny && header.Ny <= 1024);
    assert(0 < header.Nz && header.Nz <= 1024);

    field.dimension[0] = header.Nx;
    field.dimension[1] = header.Ny;
    field.dimension[2] = header.Nz;
    field.dimension[3] = header.Nt;

    std::cout << GridLogDebug << "header: " << header << std::endl;
    std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
    std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;

    assert(grid->_ndimension == Nd);
    for(int d = 0; d < Nd; d++)
      assert(grid->_fdimensions[d] == field.dimension[d]);

    field.plaquette = header.plaq;

    return field.data_start;
  }

  // Reads the configuration in `file` into Umu.
  //   Umu    : destination gauge field; its grid must be a GridCartesian with Nd dimensions
  //   header : filled from the file header (see readHeader)
  // Steps: raw parallel read -> munge odd sites into a lexicographic scalar array ->
  // vectorise -> combine forward/backward links -> compare plaquette with the header.
  // A plaquette mismatch beyond tolerance trips an assert.
  template<class vsimd>
  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
                                       FieldMetaData&                        header,
                                       std::string                           file) {
    typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;

    assert(Ns == 4 and Nd == 4 and Nc == 3);

    auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
    assert(grid != nullptr); assert(grid->_ndimension == Nd);

    uint64_t offset = readHeader(file, Umu.Grid(), header);

    FieldMetaData clone(header);

    std::string format("IEEE64"); // they always store little endian double precision
    uint32_t    nersc_csum, scidac_csuma, scidac_csumb;

    // Helper grids: one describing the on-disk layout, one used only for the parity test.
    GridCartesian*         grid_openqcd = createOpenQcdGrid(grid);
    GridRedBlackCartesian* grid_rb      = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);

    typedef DoubleStoredColourMatrixD                                              fobj;
    typedef typename DoubleStoredGaugeField::vector_object::scalar_object          sobj;
    typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;

    word w = 0;

    std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
    std::vector<sobj> scalardata(grid->lSites());

    IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
             nersc_csum, scidac_csuma, scidac_csumb);

    GridStopWatch timer;
    timer.Start();

    DoubleStoredGaugeField Umu_ds(grid);

    // Instantiate the munger on the local file/scalar types so the conversion follows the
    // precision of vsimd (previously the scalar type was fixed to the default precision,
    // which only matched scalardata for one choice of vsimd).
    auto munge = GaugeDoubleStoredMunger<fobj, sobj>();

    Coordinate ldim = grid->LocalDimensions();
    thread_for(idx_g, grid->lSites(), {
        Coordinate coor;
        grid->LocalIndexToLocalCoor(idx_g, coor);

        // Only odd sites are present in the file; even sites keep their initial value.
        bool isOdd = grid_rb->CheckerBoard(coor) == Odd;

        if(!isOdd) continue;

        // Position of this site in the file buffer: lexicographic with T slowest, then
        // X, Y, Z; halved because only every second site is stored.
        int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
                   + coor[Xdir] * ldim[Ydir] * ldim[Zdir]
                   + coor[Ydir] * ldim[Zdir]
                   + coor[Zdir])/2;

        munge(iodata[idx_o], scalardata[idx_g]);
    });

    grid->Barrier(); timer.Stop();
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;

    // Both helper grids were allocated for this call only and are no longer referenced
    // (iodata is a plain std::vector), so release them instead of leaking one grid pair
    // per configuration read.
    delete grid_rb;
    delete grid_openqcd;

    timer.Reset(); timer.Start();

    vectorizeFromLexOrdArray(scalardata, Umu_ds);

    grid->Barrier(); timer.Stop();
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;

    timer.Reset(); timer.Start();

    undoDoubleStore(Umu, Umu_ds);

    grid->Barrier(); timer.Stop();
    std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;

    // Recompute the plaquette from the field just read and compare with the header value.
    GaugeStatistics(Umu, clone);

    RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

    // clang-format off
    std::cout << GridLogMessage << "OpenQcd Configuration " << file
              << " plaquette " << clone.plaquette
              << " header " << header.plaquette
              << " difference " << plaq_diff
              << std::endl;
    // clang-format on

    RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
    RealD tol     = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code

    if(plaq_diff >= tol)
      std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
    assert(plaq_diff < tol);

    std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
  }

  // Writing is not supported: logs an error and terminates the program.
  template<class vsimd>
  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
                                        std::string                           file) {
    std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
    exit(EXIT_FAILURE);
  }

private:
  // Builds a GridCartesian whose global/local extents and processor coordinate mimic the
  // on-disk layout (Z halved, X and Z exchanged) so that IOobject can be reused for the
  // read. The returned grid is allocated for the caller, who is responsible for it.
  static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
    // exploit GridCartesian to be able to still use IOobject
    Coordinate gdim  = grid->GlobalDimensions();
    Coordinate ldim  = grid->LocalDimensions();
    Coordinate pcoor = grid->ThisProcessorCoor();

    // openqcd does rb on the z direction
    gdim[Zdir] /= 2;
    ldim[Zdir] /= 2;

    // and has the order T X Y Z (from slowest to fastest)
    std::swap(gdim[Xdir], gdim[Zdir]);
    std::swap(ldim[Xdir], ldim[Zdir]);
    std::swap(pcoor[Xdir], pcoor[Zdir]);

    GridCartesian* ret   = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid());
    ret->_ldimensions    = ldim;
    ret->_processor_coor = pcoor;
    return ret;
  }

  // Collapses the double-stored field into Umu: for each Grid direction mu_g the link is
  // the sum of file slot 2*mu_o and the +1-shifted slot 2*mu_o+1, with
  // mu_o = (mu_g + 1) % Nd accounting for the file's T-first direction order.
  template<class vsimd>
  static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>&            Umu,
                                     Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
    conformable(Umu.Grid(), Umu_ds.Grid());
    Lattice<iColourMatrix<vsimd>> U(Umu.Grid());

    // they store T+, T-, X+, X-, Y+, Y-, Z+, Z-
    for(int mu_g = 0; mu_g < Nd; ++mu_g) {
      int mu_o = (mu_g + 1) % Nd;
      U        = PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o)
               + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o + 1), mu_g, +1);
      PokeIndex<LorentzIndex>(Umu, U, mu_g);
    }
  }
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,281 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <ios>
#include <iostream>
#include <limits>
#include <iomanip>
#include <mpi.h>
#include <ostream>
#include <string>
// Debug trace helpers. Both write to std::cerr and both expand `grid->ThisRank()`, so
// they can only be used where a pointer named `grid` is in scope.
//   CHECK        : prints file, line and the rank.
//   CHECK_VAR(a) : prints file, line, rank, enclosing function, and the expression `a`
//                  together with its value.
#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;}
#define CHECK_VAR(a) { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; }
// #undef CHECK
// #define CHECK
NAMESPACE_BEGIN(Grid);
class ParRdr {
private:
bool const swap;
MPI_Status status;
MPI_File fp;
int err;
MPI_Datatype oddSiteType;
MPI_Datatype fileViewType;
GridBase* grid;
public:
ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr)
: swap(false)
, grid(gridPtr) {
err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
assert(err == MPI_SUCCESS);
}
virtual ~ParRdr() { MPI_File_close(&fp); }
inline void errInfo(int const err, std::string const& func) {
static char estring[MPI_MAX_ERROR_STRING];
int eclass = -1, len = 0;
MPI_Error_class(err, &eclass);
MPI_Error_string(err, estring, &len);
std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
}
int readHeader(FieldMetaData& field) {
assert((grid->_ndimension == Nd) && (Nd == 4));
assert(Nc == 3);
OpenQcdHeader header;
readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR);
header.plaq /= 3.; // TODO change this into normalizationfactor
// sanity check (should trigger on endian issues) TODO remove?
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
field.dimension[2] = header.Nz;
field.dimension[3] = header.Nt;
for(int d = 0; d < Nd; d++)
assert(grid->FullDimensions()[d] == field.dimension[d]);
field.plaquette = header.plaq;
field.data_start = sizeof(OpenQcdHeader);
return field.data_start;
}
void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
errInfo(err, "MPI_File_read_at_all");
// CHECK_VAR(err)
int read = -1;
MPI_Get_count(&status, datatype, &read);
// CHECK_VAR(read)
assert(nbytes == (uint64_t)read);
assert(err == MPI_SUCCESS);
}
void createTypes() {
constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
Coordinate const L = grid->GlobalDimensions();
Coordinate const l = grid->LocalDimensions();
Coordinate const i = grid->ThisProcessorCoor();
Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
}
void freeTypes() {
err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
}
bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
auto hdr_offset = readHeader(meta);
CHECK
createTypes();
err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
CHECK
int const domainSites = grid->lSites();
domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
// the actual READ
constexpr uint64_t cm_size = 2 * Nc * Nc * sizeof(double); // 2_complex
constexpr uint64_t os_size = Nd * 2 * cm_size; // 2_fwdbwd
constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int adressable elems: floor is fine
uint64_t const n_os = domainSites / 2;
for(uint64_t os_idx = 0; os_idx < n_os;) {
uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
uint64_t const cm = os_idx * Nd * 2;
readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
os_idx += read_os;
}
CHECK
err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
errInfo(err, "MPI_File_set_view1");
assert(err == MPI_SUCCESS);
freeTypes();
std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
return true;
}
};
class OpenQcdIOChromaReference : public BinaryIO {
public:
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Grid::FieldMetaData& header,
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = Umu.Grid();
typedef ColourMatrixD fobj;
std::vector<fobj> iodata(
Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
{
ParRdr rdr(MPI_COMM_WORLD, file, grid);
rdr.readGauge(iodata, header);
} // equivalent to using binaryio
std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
DoubledGaugeField Umu_ds(grid);
vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
FieldMetaData clone(header);
GaugeStatistics(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
// clang-format off
std::cout << GridLogMessage << "OpenQcd Configuration " << file
<< " plaquette " << clone.plaquette
<< " header " << header.plaquette
<< " difference " << plaq_diff
<< std::endl;
// clang-format on
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
private:
template<class vsimd>
static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
Grid::conformable(Umu.Grid(), Umu_ds.Grid());
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
U = PeekIndex<LorentzIndex>(Umu_ds, 2) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 3), 0, +1); PokeIndex<LorentzIndex>(Umu, U, 0);
U = PeekIndex<LorentzIndex>(Umu_ds, 4) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 5), 1, +1); PokeIndex<LorentzIndex>(Umu, U, 1);
U = PeekIndex<LorentzIndex>(Umu_ds, 6) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 7), 2, +1); PokeIndex<LorentzIndex>(Umu, U, 2);
U = PeekIndex<LorentzIndex>(Umu_ds, 0) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 1), 3, +1); PokeIndex<LorentzIndex>(Umu, U, 3);
}
// Scatter the flat buffer read from file (node_buff) into a per-site
// array of double-stored colour matrices (u_fb), indexed by the local
// lexicographic site index.
//
// Only sites with odd (x+y+z+t) are visited; each consumes 2*Nd
// consecutive matrices from node_buff. Entries of u_fb at even-parity
// sites are not written by this routine.
//
// u_fb      : destination, indexed by local lexicographic index
//             (assumed to hold at least grid->lSites() entries -- not
//             checked here; TODO confirm at call site).
// node_buff : source, must contain exactly Nd * grid->lSites() matrices.
// grid      : supplies the local lattice dimensions and site count.
static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
                                       std::vector<ColourMatrixD> const& node_buff,
                                       GridBase* grid) {
  assert(node_buff.size() == Nd * grid->lSites());

  Coordinate const& l = grid->LocalDimensions();
  Coordinate coord(Nd);
  int buff_idx = 0; // running read position inside node_buff

  // IMPORTANT: openQCD file ordering -- t is the slowest index, z the fastest.
  for(int t = 0; t < l[3]; ++t) {
    coord[3] = t;
    for(int x = 0; x < l[0]; ++x) {
      coord[0] = x;
      for(int y = 0; y < l[1]; ++y) {
        coord[1] = y;
        for(int z = 0; z < l[2]; ++z) {
          coord[2] = z;

          // The buffer carries data for odd-parity sites only.
          const bool oddSite = ((t + z + y + x) % 2) != 0;
          if(!oddSite) continue;

          int local_idx;
          Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());

          // Copy the 2*Nd colour matrices belonging to this site, element by element.
          for(int mu = 0; mu < 2 * Nd; ++mu) {
            for(int c1 = 0; c1 < Nc; ++c1) {
              for(int c2 = 0; c2 < Nc; ++c2) {
                u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
              }
            }
          }
          buff_idx += 2 * Nd;
        }
      }
    }
  }

  // Every matrix in the buffer must have been consumed exactly once.
  assert(node_buff.size() == buff_idx);
}
};
NAMESPACE_END(Grid);

View File

@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#ifdef __x86_64__ #ifdef __x86_64__
#ifdef GRID_NVCC #ifdef GRID_CUDA
accelerator_inline uint64_t __rdtsc(void) { return 0; } accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; } accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else #else
@ -95,7 +95,8 @@ inline uint64_t cyclecount(void){
} }
#elif defined __x86_64__ #elif defined __x86_64__
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return __rdtsc(); uint64_t ret = __rdtsc();
return (uint64_t)ret;
} }
#else #else
@ -111,7 +112,6 @@ class PerformanceCounter {
private: private:
typedef struct { typedef struct {
public:
uint32_t type; uint32_t type;
uint64_t config; uint64_t config;
const char *name; const char *name;

View File

@ -110,15 +110,15 @@ public:
#endif #endif
accumulator = std::chrono::duration_cast<GridUsecs>(start-start); accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
} }
GridTime Elapsed(void) { GridTime Elapsed(void) const {
assert(running == false); assert(running == false);
return std::chrono::duration_cast<GridTime>( accumulator ); return std::chrono::duration_cast<GridTime>( accumulator );
} }
uint64_t useconds(void){ uint64_t useconds(void) const {
assert(running == false); assert(running == false);
return (uint64_t) accumulator.count(); return (uint64_t) accumulator.count();
} }
bool isRunning(void){ bool isRunning(void) const {
return running; return running;
} }
}; };

View File

@ -12773,7 +12773,7 @@ namespace pugi
#undef PUGI__THROW_ERROR #undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR #undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC #ifdef GRID_CUDA
#pragma pop #pragma pop
#endif #endif

View File

@ -133,23 +133,23 @@ typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF; typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD; typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<ComplexF > SpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
// LorentzColour // LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix; typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
@ -443,16 +443,16 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
////////////////////////////////////////////// //////////////////////////////////////////////
// Fermion <-> propagator assignements // Fermion <-> propagator assignements
////////////////////////////////////////////// //////////////////////////////////////////////
//template <class Prop, class Ferm> //template <class Prop, class Ferm>
template <class Fimpl> template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c) void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{ {
for(int j = 0; j < Ns; ++j) for(int j = 0; j < Ns; ++j)
{ {
auto pjs = peekSpin(p, j, s); auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j); auto fj = peekSpin(f, j);
for(int i = 0; i < Fimpl::Dimension; ++i) for(int i = 0; i < Fimpl::Dimension; ++i)
{ {
pokeColour(pjs, peekColour(fj, i), i, c); pokeColour(pjs, peekColour(fj, i), i, c);
} }
@ -460,16 +460,16 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
} }
} }
//template <class Prop, class Ferm> //template <class Prop, class Ferm>
template <class Fimpl> template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c) void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{ {
for(int j = 0; j < Ns; ++j) for(int j = 0; j < Ns; ++j)
{ {
auto pjs = peekSpin(p, j, s); auto pjs = peekSpin(p, j, s);
auto fj = peekSpin(f, j); auto fj = peekSpin(f, j);
for(int i = 0; i < Fimpl::Dimension; ++i) for(int i = 0; i < Fimpl::Dimension; ++i)
{ {
pokeColour(fj, peekColour(pjs, i, c), i); pokeColour(fj, peekColour(pjs, i, c), i);
} }

View File

@ -40,8 +40,8 @@ public:
public: public:
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out); virtual void M (const FermionField &in, FermionField &out);
virtual RealD Mdag (const FermionField &in, FermionField &out); virtual void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operations // half checkerboard operations
virtual void Meooe (const FermionField &in, FermionField &out); virtual void Meooe (const FermionField &in, FermionField &out);
@ -141,7 +141,33 @@ public:
Vector<iSinglet<Simd> > MatpInvDag; Vector<iSinglet<Simd> > MatpInvDag;
Vector<iSinglet<Simd> > MatmInvDag; Vector<iSinglet<Simd> > MatmInvDag;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
// Virtual can't template
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
void ContractJ5q(FermionField &q_in,ComplexField &J5q);
///////////////////////////////////////////////////////////////
// Constructors // Constructors
///////////////////////////////////////////////////////////////
CayleyFermion5D(GaugeField &_Umu, CayleyFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid, GridRedBlackCartesian &FiveDimRedBlackGrid,

View File

@ -41,8 +41,8 @@ public:
public: public:
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out); virtual void M (const FermionField &in, FermionField &out);
virtual RealD Mdag (const FermionField &in, FermionField &out); virtual void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out); virtual void Meooe (const FermionField &in, FermionField &out);

View File

@ -53,8 +53,8 @@ public:
virtual void DtildeInv (const FermionField& in, FermionField& out); virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply // override multiply
virtual RealD M (const FermionField& in, FermionField& out); virtual void M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out); virtual void Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations // half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out); virtual void Mooee (const FermionField& in, FermionField& out);

View File

@ -114,19 +114,22 @@ public:
U = adj(Cshift(U, mu, -1)); U = adj(Cshift(U, mu, -1));
PokeIndex<LorentzIndex>(Uadj, U, mu); PokeIndex<LorentzIndex>(Uadj, U, mu);
} }
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { autoView(Umu_v,Umu,CpuRead);
autoView(Uadj_v,Uadj,CpuRead);
autoView(Uds_v,Uds,CpuWrite);
thread_for( lidx, GaugeGrid->lSites(), {
Coordinate lcoor; Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu, lcoor); peekLocalSite(ScalarUmu, Umu_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj, lcoor); peekLocalSite(ScalarUmu, Uadj_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds, lcoor); pokeLocalSite(ScalarUds, Uds_v, lcoor);
} });
} }
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)

View File

@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D); NAMESPACE_CHECK(Wilson5D);
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h> #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h> #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
NAMESPACE_CHECK(Staggered); NAMESPACE_CHECK(Staggered);
@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD; typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD; typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
#ifndef GRID_NVCC #ifndef GRID_CUDA
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR; typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF; typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD; typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;

View File

@ -58,8 +58,8 @@ public:
virtual GridBase *GaugeRedBlackGrid(void) =0; virtual GridBase *GaugeRedBlackGrid(void) =0;
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out)=0; virtual void M (const FermionField &in, FermionField &out)=0;
virtual RealD Mdag (const FermionField &in, FermionField &out)=0; virtual void Mdag (const FermionField &in, FermionField &out)=0;
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out)=0; virtual void Meooe (const FermionField &in, FermionField &out)=0;
@ -86,15 +86,14 @@ public:
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0; virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0; virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);}; virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist)
{ {
FFT theFFT((GridCartesian *) in.Grid()); FFT theFFT((GridCartesian *) in.Grid());
@ -148,15 +147,19 @@ public:
virtual void ContractConservedCurrent(PropagatorField &q_in_1, virtual void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2, PropagatorField &q_in_2,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type, Current curr_type,
unsigned int mu)=0; unsigned int mu)
{assert(0);};
virtual void SeqConservedCurrent(PropagatorField &q_in, virtual void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
unsigned int tmin, unsigned int tmin,
unsigned int tmax, unsigned int tmax,
ComplexField &lattice_cmplx)=0; ComplexField &lattice_cmplx)
{assert(0);};
// Only reimplemented in Wilson5D // Only reimplemented in Wilson5D
// Default to just a zero correlation function // Default to just a zero correlation function

View File

@ -38,6 +38,7 @@ public:
static const bool isFundamental = Representation::isFundamental; static const bool isFundamental = Representation::isFundamental;
static const int Nhcs = Options::Nhcs; static const int Nhcs = Options::Nhcs;
static const bool LsVectorised=false; static const bool LsVectorised=false;
static const bool isGparity=true;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl; typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
@ -46,7 +47,7 @@ public:
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL; typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Dimension>, Ns>, Ngp>; template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Dimension>, Ns>, Ngp>;
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>, Ngp>; template <typename vtype> using iImplPropagator = iMatrix<iMatrix<iMatrix<vtype, Dimension>, Ns>, Ngp>;
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhs>, Ngp>; template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhs>, Ngp>;
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>; template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>; template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
@ -80,6 +81,7 @@ public:
{ {
assert(0); assert(0);
} }
template<class _Spinor> template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi, static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
@ -94,11 +96,11 @@ public:
int sl = St._simd_layout[direction]; int sl = St._simd_layout[direction];
Coordinate icoor; Coordinate icoor;
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
_Spinor tmp; _Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd(); const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = SIMTlane(Nsimd); int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s); St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd; int mmu = mu % Nd;
@ -191,6 +193,16 @@ public:
#endif #endif
} }
template<class _SpinorField>
inline void multLinkField(_SpinorField & out,
const DoubledGaugeField &Umu,
const _SpinorField & phi,
int mu)
{
assert(0);
}
template <class ref> template <class ref>
static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
{ {
@ -220,15 +232,17 @@ public:
if ( Params.twists[mu] ) { if ( Params.twists[mu] ) {
Uconj = where(coor==neglink,-Uconj,Uconj); Uconj = where(coor==neglink,-Uconj,Uconj);
} }
auto U_v = U.View(); {
auto Uds_v = Uds.View(); autoView( U_v , U, CpuRead);
auto Uconj_v = Uconj.View(); autoView( Uconj_v , Uconj, CpuRead);
auto Utmp_v= Utmp.View(); autoView( Uds_v , Uds, CpuWrite);
thread_foreach(ss,U_v,{ autoView( Utmp_v, Utmp, CpuWrite);
Uds_v[ss](0)(mu) = U_v[ss](); thread_foreach(ss,U_v,{
Uds_v[ss](1)(mu) = Uconj_v[ss](); Uds_v[ss](0)(mu) = U_v[ss]();
}); Uds_v[ss](1)(mu) = Uconj_v[ss]();
});
}
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
Uconj = adj(Cshift(Uconj,mu,-1)); Uconj = adj(Cshift(Uconj,mu,-1));
@ -238,19 +252,25 @@ public:
Utmp = where(coor==0,Uconj,Utmp); Utmp = where(coor==0,Uconj,Utmp);
} }
thread_foreach(ss,Utmp_v,{ {
Uds_v[ss](0)(mu+4) = Utmp_v[ss](); autoView( Uds_v , Uds, CpuWrite);
}); autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
});
}
Utmp = Uconj; Utmp = Uconj;
if ( Params.twists[mu] ) { if ( Params.twists[mu] ) {
Utmp = where(coor==0,U,Utmp); Utmp = where(coor==0,U,Utmp);
} }
thread_foreach(ss,Utmp_v,{ {
Uds_v[ss](1)(mu+4) = Utmp_v[ss](); autoView( Uds_v , Uds, CpuWrite);
}); autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
});
}
} }
} }
@ -260,11 +280,14 @@ public:
GaugeLinkField link(mat.Grid()); GaugeLinkField link(mat.Grid());
// use lorentz for flavour as hack. // use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A)); auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
auto link_v = link.View();
auto tmp_v = tmp.View(); {
thread_foreach(ss,tmp_v,{ autoView( link_v , link, CpuWrite);
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); autoView( tmp_v , tmp, CpuRead);
}); thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
});
}
PokeIndex<LorentzIndex>(mat, link, mu); PokeIndex<LorentzIndex>(mat, link, mu);
return; return;
} }
@ -294,16 +317,18 @@ public:
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
auto tmp_v = tmp.View(); {
auto Atilde_v = Atilde.View(); autoView( tmp_v , tmp, CpuWrite);
auto Btilde_v = Btilde.View(); autoView( Atilde_v , Atilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{ autoView( Btilde_v , Btilde, CpuRead);
for (int s = 0; s < Ls; s++) { thread_for(ss,tmp.Grid()->oSites(),{
int sF = s + Ls * ss; for (int s = 0; s < Ls; s++) {
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF])); int sF = s + Ls * ss;
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
} tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}); }
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu); PokeIndex<LorentzIndex>(mat, tmp, mu);
return; return;
} }

View File

@ -71,8 +71,8 @@ public:
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out); void M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out); void Mdag(const FermionField &in, FermionField &out);
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// half checkerboard operations // half checkerboard operations
@ -185,10 +185,12 @@ public:
void ContractConservedCurrent(PropagatorField &q_in_1, void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2, PropagatorField &q_in_2,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &src,
Current curr_type, Current curr_type,
unsigned int mu); unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in, void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &srct,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
unsigned int tmin, unsigned int tmin,

View File

@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -62,8 +61,8 @@ public:
double DhopCalls; double DhopCalls;
double DhopCommTime; double DhopCommTime;
double DhopComputeTime; double DhopComputeTime;
double DhopComputeTime2; double DhopComputeTime2;
double DhopFaceTime; double DhopFaceTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
@ -74,8 +73,8 @@ public:
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
// full checkerboard operations; leave unimplemented as abstract for now // full checkerboard operations; leave unimplemented as abstract for now
RealD M (const FermionField &in, FermionField &out); void M (const FermionField &in, FermionField &out);
RealD Mdag (const FermionField &in, FermionField &out); void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operations // half checkerboard operations
void Meooe (const FermionField &in, FermionField &out); void Meooe (const FermionField &in, FermionField &out);
@ -217,15 +216,17 @@ public:
void ContractConservedCurrent(PropagatorField &q_in_1, void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2, PropagatorField &q_in_2,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &src,
Current curr_type, Current curr_type,
unsigned int mu); unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in, void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &src,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
unsigned int tmin, unsigned int tmin,
unsigned int tmax, unsigned int tmax,
ComplexField &lattice_cmplx); ComplexField &lattice_cmplx);
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -56,8 +56,8 @@ public:
virtual void DtildeInv (const FermionField& in, FermionField& out); virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply // override multiply
virtual RealD M (const FermionField& in, FermionField& out); virtual void M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out); virtual void Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations // half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out); virtual void Mooee (const FermionField& in, FermionField& out);

View File

@ -59,7 +59,7 @@ public:
{ {
RealD eps = 1.0; RealD eps = 1.0;
std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl; // std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
assert(zdata->n==this->Ls); assert(zdata->n==this->Ls);

View File

@ -0,0 +1,194 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
#define GRID_QCD_NAIVE_STAG_FERMION_H
NAMESPACE_BEGIN(Grid);
// Non-templated holder for the static data shared by every
// NaiveStaggeredFermion<Impl> instantiation (which inherits from it).
class NaiveStaggeredFermionStatic {
public:
  // Fixed point count, initialised in-class to 8.
  // NOTE(review): presumably the number of stencil legs (+/- in each of
  // four directions) -- confirm against the stencil construction.
  static const int npoint = 8;

  // Declared here only; the definitions (and hence the contents) live in
  // another translation unit not visible from this header.
  // NOTE(review): presumably one entry per stencil point -- confirm.
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
};
template <class Impl>
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
void M(const FermionField &in, FermionField &out);
void Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
void Mooee(const FermionField &in, FermionField &out);
void MooeeDag(const FermionField &in, FermionField &out);
void MooeeInv(const FermionField &in, FermionField &out);
void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop (const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st,
DoubledGaugeField &U,
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
NaiveStaggeredFermion(GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge (const GaugeField &_U );
// Accessor: hands back a mutable reference to the Umu data member.
DoubledGaugeField &GetU(void)
{
  return Umu;
}
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members required to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
// any other parameters of action ???
// Always reports 1.
virtual int isTrivialEE(void)
{
  return 1;
}
// Returns the value held in the mass data member.
virtual RealD Mass(void)
{
  return mass;
}
RealD mass;
RealD u0;
RealD c1;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
// Declaration only; the definition is not visible in this part of the file.
// Takes three propagator fields (q_in_1, q_in_2, q_out) plus a further
// propagator field src, a Current selector and an unsigned index mu.
// All propagator fields are passed by non-const reference.
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu);
// Declaration only; the definition is not visible in this part of the file.
// Takes propagator fields q_in, q_out and srct, a Current selector, an
// unsigned index mu, an unsigned [tmin, tmax] pair and a complex field.
// NOTE(review): the third parameter is spelled `srct` here while the
// neighbouring ContractConservedCurrent declaration uses `src` - presumably
// the same kind of source field; confirm and unify the spelling.
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &srct,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
NAMESPACE_END(Grid);
#endif

View File

@ -47,8 +47,8 @@ public:
void M_internal(const FermionField &in, FermionField &out,int dag); void M_internal(const FermionField &in, FermionField &out,int dag);
// override multiply // override multiply
virtual RealD M (const FermionField &in, FermionField &out); virtual void M (const FermionField &in, FermionField &out);
virtual RealD Mdag (const FermionField &in, FermionField &out); virtual void Mdag (const FermionField &in, FermionField &out);
// half checkerboard operaions // half checkerboard operaions
virtual void Meooe (const FermionField &in, FermionField &out); virtual void Meooe (const FermionField &in, FermionField &out);

View File

@ -47,23 +47,34 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base; typedef FermionOperator<Impl> Base;
public: public:
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
protected:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels // Generic Nc kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
@ -71,15 +82,18 @@ public:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels // Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
@ -87,27 +101,10 @@ public:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels // Asm Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
public: public:

View File

@ -113,20 +113,7 @@ public:
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu) inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
{ {
GridBase *GaugeGrid = U_ds.Grid(); assert(0);
thread_for(lidx, GaugeGrid->lSites(),{
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, U_ds, lcoor);
peekLocalSite(ScalarU, U, lcoor);
ScalarUds(mu) = ScalarU();
});
} }
inline void DoubleStore(GridBase *GaugeGrid, inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &UUUds, // for Naik term

View File

@ -109,9 +109,8 @@ public:
ImportGauge(_Umu); ImportGauge(_Umu);
} }
virtual RealD M(const FermionField &in, FermionField &out); virtual void M(const FermionField &in, FermionField &out);
virtual RealD Mdag(const FermionField &in, FermionField &out); virtual void Mdag(const FermionField &in, FermionField &out);
virtual void Mooee(const FermionField &in, FermionField &out); virtual void Mooee(const FermionField &in, FermionField &out);
virtual void MooeeDag(const FermionField &in, FermionField &out); virtual void MooeeDag(const FermionField &in, FermionField &out);
virtual void MooeeInv(const FermionField &in, FermionField &out); virtual void MooeeInv(const FermionField &in, FermionField &out);
@ -258,15 +257,16 @@ private:
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12 // eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices // using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F) CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{ {
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v,T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@ -282,9 +282,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v, T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v, F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = -F_v[i]()(); T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()(); T_v[i]()(1, 0) = F_v[i]()();
@ -300,9 +300,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v,T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()()); T_v[i]()(1, 1) = timesI(F_v[i]()());
@ -318,9 +318,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v , T, AcceleratorWrite);
auto F_v = F.View(); autoView( F_v , F, AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = timesI(F_v[i]()()); T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()()); T_v[i]()(1, 0) = timesI(F_v[i]()());
@ -336,9 +336,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v ,T,AcceleratorWrite);
auto F_v = F.View(); autoView( F_v ,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = -(F_v[i]()()); T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()()); T_v[i]()(1, 0) = (F_v[i]()());
@ -355,9 +355,9 @@ private:
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v , T,AcceleratorWrite);
auto F_v = F.View(); autoView( F_v , F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 0) = timesI(F_v[i]()()); T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); T_v[i]()(1, 1) = timesMinusI(F_v[i]()());

View File

@ -92,8 +92,8 @@ public:
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
virtual RealD M(const FermionField &in, FermionField &out); virtual void M(const FermionField &in, FermionField &out);
virtual RealD Mdag(const FermionField &in, FermionField &out); virtual void Mdag(const FermionField &in, FermionField &out);
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// half checkerboard operations // half checkerboard operations
@ -193,15 +193,17 @@ public:
void ContractConservedCurrent(PropagatorField &q_in_1, void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2, PropagatorField &q_in_2,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type, Current curr_type,
unsigned int mu); unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in, void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
unsigned int tmin, unsigned int tmin,
unsigned int tmax, unsigned int tmax,
ComplexField &lattice_cmplx); ComplexField &lattice_cmplx);
}; };
typedef WilsonFermion<WilsonImplF> WilsonFermionF; typedef WilsonFermion<WilsonImplF> WilsonFermionF;

View File

@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -99,8 +98,8 @@ public:
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;} GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
// full checkerboard operations; leave unimplemented as abstract for now // full checkerboard operations; leave unimplemented as abstract for now
virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;}; virtual void M (const FermionField &in, FermionField &out){assert(0);};
virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;}; virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now // half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);}; virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
@ -217,25 +216,7 @@ public:
// Comms buffer // Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
void ContractJ5q(FermionField &q_in,ComplexField &J5q);
}; };

View File

@ -41,6 +41,7 @@ public:
static const int Dimension = Representation::Dimension; static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental; static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false; static const bool LsVectorised=false;
static const bool isGparity=false;
static const int Nhcs = Options::Nhcs; static const int Nhcs = Options::Nhcs;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@ -98,8 +99,21 @@ public:
{ {
multLink(phi,U,chi,mu); multLink(phi,U,chi,mu);
} }
// Field-level wrapper around the per-site multLink call.
//   out : destination field, opened with AcceleratorWrite
//   Umu : doubled gauge field, opened with AcceleratorRead
//   phi : input field, opened with AcceleratorRead
//   mu  : index forwarded unchanged to multLink
// The loop bound is out.Grid()->oSites(); phi and Umu are indexed with the
// same site index, so they are assumed to live on a compatible grid
// (nothing here checks that - TODO confirm callers guarantee it).
template<class _SpinorField>
inline void multLinkField(_SpinorField & out,
const DoubledGaugeField &Umu,
const _SpinorField & phi,
int mu)
{
autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead);
// One multLink call per site index sss; third accelerator_for argument is 1.
accelerator_for(sss,out.Grid()->oSites(),1,{
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
});
}
template <class ref> template <class ref>
static accelerator_inline void loadLinkElement(Simd &reg, ref &memory) static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
{ {
@ -177,18 +191,19 @@ public:
int Ls=Btilde.Grid()->_fdimensions[0]; int Ls=Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
auto tmp_v = tmp.View(); {
auto Btilde_v = Btilde.View(); autoView( tmp_v , tmp, AcceleratorWrite);
auto Atilde_v = Atilde.View(); autoView( Btilde_v , Btilde, AcceleratorRead);
thread_for(sss,tmp.Grid()->oSites(),{ autoView( Atilde_v , Atilde, AcceleratorRead);
int sU=sss; accelerator_for(sss,tmp.Grid()->oSites(),1,{
for(int s=0;s<Ls;s++){ int sU=sss;
int sF = s+Ls*sU; for(int s=0;s<Ls;s++){
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here int sF = s+Ls*sU;
} tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
}); }
});
}
PokeIndex<LorentzIndex>(mat,tmp,mu); PokeIndex<LorentzIndex>(mat,tmp,mu);
} }
}; };

View File

@ -66,41 +66,6 @@ public:
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma); int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
//////////////////////////////////////////////////////////////////////////////
// Utilities for inserting Wilson conserved current.
//////////////////////////////////////////////////////////////////////////////
static void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeFieldView &U,
unsigned int sU,
unsigned int mu,
bool switch_sign = false);
static void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeFieldView &U,
unsigned int sU,
unsigned int mu,
bool switch_sign = false);
static void SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeFieldView &U,
unsigned int sU,
unsigned int mu,
vPredicate t_mask,
bool switch_sign = false);
static void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeFieldView &U,
unsigned int sU,
unsigned int mu,
vPredicate t_mask,
bool switch_sign = false);
private: private:
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf, static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,

View File

@ -120,7 +120,8 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
} }
} }
virtual RealD M(const FermionField &in, FermionField &out) { virtual void M(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
this->Dhop(in, out, DaggerNo); this->Dhop(in, out, DaggerNo);
FermionField tmp(out.Grid()); FermionField tmp(out.Grid());
@ -129,11 +130,12 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
ComplexD b(0.0,this->mu[s]); ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(tmp,a,in,b,in,s,s); axpbg5y_ssp(tmp,a,in,b,in,s,s);
} }
return axpy_norm(out, 1.0, tmp, out); axpy(out, 1.0, tmp, out);
} }
// needed for fast PV // needed for fast PV
void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) { void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu)
{
assert(_mass.size() == _mu.size()); assert(_mass.size() == _mu.size());
assert(_mass.size() == this->FermionGrid()->_fdimensions[0]); assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
this->mass = _mass; this->mass = _mass;

View File

@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_NVCC #ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
@ -323,7 +323,7 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
} }
template<class Impl> template<class Impl>
RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -335,11 +335,10 @@ RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
axpby(chi,1.0,1.0,chi,psi); axpby(chi,1.0,1.0,chi,psi);
M5D(psi,chi); M5D(psi,chi);
return(norm2(chi));
} }
template<class Impl> template<class Impl>
RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
{ {
// Under adjoint // Under adjoint
//D1+ D1- P- -> D1+^dag P+ D2-^dag //D1+ D1- P- -> D1+^dag P+ D2-^dag
@ -354,7 +353,6 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
M5Ddag(psi,chi); M5Ddag(psi,chi);
// ((b D_W + D_w hop terms +1) on s-diag // ((b D_W + D_w hop terms +1) on s-diag
axpby (chi,1.0,1.0,chi,psi); axpby (chi,1.0,1.0,chi,psi);
return norm2(chi);
} }
// half checkerboard operations // half checkerboard operations
@ -588,6 +586,356 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
// this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); // this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
} }
// Build the J5q field from a fermion field living on the FermionGrid().
//
//   q_in : input field; must be conformable with FermionGrid()
//   J5q  : output complex field; must be conformable with GaugeGrid()
//
// Two slices of q_in are extracted (index Ls/2-1 and index Ls/2, last
// ExtractSlice argument 0), combined as
//   p = 0.5*( (1+G5)*slice[Ls/2-1] + (1-G5)*slice[Ls/2] )
// and J5q is set to localInnerProduct(p,p).
//
// The previously declared but never used temporary field `psi` has been
// dropped: constructing it only cost an extra field on the GaugeGrid().
template <class Impl>
void CayleyFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
{
  conformable(this->GaugeGrid(), J5q.Grid());
  conformable(q_in.Grid(), this->FermionGrid());
  Gamma G5(Gamma::Algebra::Gamma5);
  // Work fields live on the GaugeGrid()
  int Ls = this->Ls;
  FermionField p_plus (this->GaugeGrid());
  FermionField p_minus(this->GaugeGrid());
  FermionField p(this->GaugeGrid());
  // The two slices either side of the midpoint index Ls/2
  ExtractSlice(p_plus , q_in, Ls/2-1 , 0);
  ExtractSlice(p_minus, q_in, Ls/2 , 0);
  // (1 +/- G5) applied without the factor 1/2; the 0.5 below restores it
  p_plus = p_plus + G5*p_plus;
  p_minus= p_minus - G5*p_minus;
  p=0.5*(p_plus+p_minus);
  J5q = localInnerProduct(p,p);
}
// Build the J5q field from a propagator field living on the FermionGrid().
//
//   q_in : input propagator; must be conformable with FermionGrid()
//   J5q  : output complex field; must be conformable with GaugeGrid()
//
// Two slices of q_in are extracted (index Ls/2-1 and index Ls/2, last
// ExtractSlice argument 0), combined as
//   p = 0.5*( (1+G5)*slice[Ls/2-1] + (1-G5)*slice[Ls/2] )
// and J5q is set to localInnerProduct(p,p).
//
// The previously declared but never used temporary field `psi` has been
// dropped: constructing it only cost an extra propagator on the GaugeGrid().
template <class Impl>
void CayleyFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
{
  conformable(this->GaugeGrid(), J5q.Grid());
  conformable(q_in.Grid(), this->FermionGrid());
  Gamma G5(Gamma::Algebra::Gamma5);
  // Work fields live on the GaugeGrid()
  int Ls = this->Ls;
  PropagatorField p_plus (this->GaugeGrid());
  PropagatorField p_minus(this->GaugeGrid());
  PropagatorField p(this->GaugeGrid());
  // The two slices either side of the midpoint index Ls/2
  ExtractSlice(p_plus , q_in, Ls/2-1 , 0);
  ExtractSlice(p_minus, q_in, Ls/2 , 0);
  // (1 +/- G5) applied without the factor 1/2; the 0.5 below restores it
  p_plus = p_plus + G5*p_plus;
  p_minus= p_minus - G5*p_minus;
  p=0.5*(p_plus+p_minus);
  J5q = localInnerProduct(p,p);
}
// Helper macros for the conserved-current routines that follow.
// They are purely textual and rely on names being visible at the point of
// expansion:
//   Pp / Pm          need `g5`
//   Q_4d             needs `g5` and `Ls`; Q must be indexable with []
//   TopRowWithSource needs `g5`, `Ls`, `phys_src` and `mass`
// The argument is parenthesised everywhere so that a compound expression
// (e.g. Pp(a+b)) expands with the intended precedence. Note that Q is
// still evaluated twice by Pp/Pm, so do not pass expressions with side effects.
#define Pp(Q) (0.5*((Q)+g5*(Q)))
#define Pm(Q) (0.5*((Q)-g5*(Q)))
#define Q_4d(Q) (Pm((Q)[0]) + Pp((Q)[Ls-1]))
#define TopRowWithSource(Q) (phys_src + (1.0-mass)*Q_4d(Q))
// Contract two propagator fields into a conserved-current field.
//
//   q_in_1, q_in_2 : input propagators; each is cut into Ls slices (L_Q and
//                    R_Q respectively) on the GaugeGrid()
//   q_out          : output; set to Zero() and then accumulated over s
//   phys_src       : source field, picked up by the TopRowWithSource macro
//   curr_type      : Current::Axial flips the sign of the s < Ls/2 terms
//   mu             : index into the local Gmu table and the direction handed
//                    to Cshift / multLinkField (no range check is done here)
//
// NOTE: the whole body is compiled only when neither GRID_CUDA nor GRID_HIP
// is defined. In those builds the function is empty and q_out is untouched.
//
// The macros Pp, Pm, Q_4d and TopRowWithSource used below expand textually
// and pick up g5, Ls, phys_src and mass from this scope.
template <class Impl>
void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu)
{
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
// Gamma matrix lookup table indexed by mu (five entries, last is Gamma5)
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
Gamma::Algebra::Gamma5
};
auto UGrid= this->GaugeGrid();
// NOTE(review): FGrid is not referenced again in this function
auto FGrid= this->FermionGrid();
// Sign applied to the s < Ls/2 contributions: -1 for the axial current
RealD sgn=1.0;
if ( curr_type == Current::Axial ) sgn = -1.0;
int Ls = this->Ls;
// Slice both inputs into Ls fields on UGrid
std::vector<PropagatorField> L_Q(Ls,UGrid);
std::vector<PropagatorField> R_Q(Ls,UGrid);
for(int s=0;s<Ls;s++){
ExtractSlice(L_Q[s], q_in_1, s , 0);
ExtractSlice(R_Q[s], q_in_2, s , 0);
}
Gamma g5(Gamma::Algebra::Gamma5);
PropagatorField C(UGrid);
PropagatorField p5d(UGrid);
PropagatorField us_p5d(UGrid);
PropagatorField gp5d(UGrid);
PropagatorField gus_p5d(UGrid);
PropagatorField L_TmLsGq0(UGrid);
PropagatorField L_TmLsTmp(UGrid);
PropagatorField R_TmLsGq0(UGrid);
PropagatorField R_TmLsTmp(UGrid);
// Boundary combinations built from Q_4d and TopRowWithSource; these replace
// the generic L_TmLsGq / R_TmLsGq entries in the s==0 and s==Ls-1 branches.
{
PropagatorField TermA(UGrid);
PropagatorField TermB(UGrid);
PropagatorField TermC(UGrid);
PropagatorField TermD(UGrid);
TermA = (Pp(Q_4d(L_Q)));
TermB = (Pm(Q_4d(L_Q)));
TermC = (Pm(TopRowWithSource(L_Q)));
TermD = (Pp(TopRowWithSource(L_Q)));
L_TmLsGq0 = (TermD - TermA + TermB);
L_TmLsTmp = (TermC - TermB + TermA);
// Same construction for the second input (temporaries are reused)
TermA = (Pp(Q_4d(R_Q)));
TermB = (Pm(Q_4d(R_Q)));
TermC = (Pm(TopRowWithSource(R_Q)));
TermD = (Pp(TopRowWithSource(R_Q)));
R_TmLsGq0 = (TermD - TermA + TermB);
R_TmLsTmp = (TermC - TermB + TermA);
}
// Generic entries: Pm of slice s plus Pp of slice s-1 (cyclic in Ls)
std::vector<PropagatorField> R_TmLsGq(Ls,UGrid);
std::vector<PropagatorField> L_TmLsGq(Ls,UGrid);
for(int s=0;s<Ls;s++){
R_TmLsGq[s] = (Pm((R_Q)[(s)]) + Pp((R_Q)[((s)-1+Ls)%Ls]));
L_TmLsGq[s] = (Pm((L_Q)[(s)]) + Pp((L_Q)[((s)-1+Ls)%Ls]));
}
Gamma gmu=Gamma(Gmu[mu]);
q_out = Zero();
PropagatorField tmp(UGrid);
// Accumulate one contribution C per s
for(int s=0;s<Ls;s++){
// sp: next index (cyclic); sr: index counted from the other end; srp: sr+1 (cyclic)
int sp = (s+1)%Ls;
int sr = Ls-1-s;
int srp= (sr+1)%Ls;
// Mobius parameters
auto b=this->bs[s];
auto c=this->cs[s];
auto bpc = 1.0/(b+c); // -0.5 factor in gauge links
// First half: p5d from the L fields, tmp from the R fields (tmp is then
// shifted by +1 in direction mu and multiplied by the link)
if (s == 0) {
p5d =(b*Pm(L_TmLsGq[Ls-1])+ c*Pp(L_TmLsGq[Ls-1]) + b*Pp(L_TmLsTmp) + c*Pm(L_TmLsTmp ));
tmp =(b*Pm(R_TmLsGq0) + c*Pp(R_TmLsGq0 ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
} else if (s == Ls-1) {
p5d =(b*Pm(L_TmLsGq0) + c*Pp(L_TmLsGq0 ) + b*Pp(L_TmLsGq[1]) + c*Pm(L_TmLsGq[1]));
tmp =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp) + c*Pm(R_TmLsTmp ));
} else {
p5d =(b*Pm(L_TmLsGq[sr]) + c*Pp(L_TmLsGq[sr])+ b*Pp(L_TmLsGq[srp])+ c*Pm(L_TmLsGq[srp]));
tmp =(b*Pm(R_TmLsGq[s]) + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
}
tmp = Cshift(tmp,mu,1);
Impl::multLinkField(us_p5d,this->Umu,tmp,mu);
gp5d=g5*p5d*g5;
gus_p5d=gmu*us_p5d;
// C is (re)initialised here for this s, then reduced by the remaining terms
C = bpc*(adj(gp5d)*us_p5d);
C-= bpc*(adj(gp5d)*gus_p5d);
// Second half: roles swapped - p5d from the R fields, tmp from the L fields
if (s == 0) {
p5d =(b*Pm(R_TmLsGq0) + c*Pp(R_TmLsGq0 ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
tmp =(b*Pm(L_TmLsGq[Ls-1])+ c*Pp(L_TmLsGq[Ls-1]) + b*Pp(L_TmLsTmp) + c*Pm(L_TmLsTmp ));
} else if (s == Ls-1) {
p5d =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp) + c*Pm(R_TmLsTmp ));
tmp =(b*Pm(L_TmLsGq0) + c*Pp(L_TmLsGq0 ) + b*Pp(L_TmLsGq[1]) + c*Pm(L_TmLsGq[1]));
} else {
p5d =(b*Pm(R_TmLsGq[s]) + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
tmp =(b*Pm(L_TmLsGq[sr]) + c*Pp(L_TmLsGq[sr]) + b*Pp(L_TmLsGq[srp])+ c*Pm(L_TmLsGq[srp]));
}
tmp = Cshift(tmp,mu,1);
Impl::multLinkField(us_p5d,this->Umu,tmp,mu);
gp5d=gmu*p5d;
gus_p5d=g5*us_p5d*g5;
C-= bpc*(adj(gus_p5d)*gp5d);
C-= bpc*(adj(gus_p5d)*p5d);
// sgn only affects the first Ls/2 values of s
if (s < Ls/2) q_out += sgn*C;
else q_out += C;
}
#endif
}
// Sequential conserved-current insertion for the Cayley 5D operator.
//
// For every index s along dimension 0 of q_in, a hopping term in direction mu
// is built (forward hop plus backward hop through the adjoint link), restricted
// to a window in the last lattice coordinate (Nd-1), and written into slice s
// of q_out.
//
//   q_in      : input propagator; read slice by slice with ExtractSlice(...,s,0)
//   q_out     : output propagator; slice s is overwritten with InsertSlice(...,s,0)
//   phys_src  : not named directly in this body; presumably consumed through the
//               TopRowWithSource macro -- confirm against the macro definition
//   curr_type : Current::Axial flips the sign of the first Ls/2 slices
//   mu        : hop direction, must satisfy mu < Nd
//   tmin,tmax : inclusive window on coordinate Nd-1 for the forward hop; the
//               backward hop uses the window shifted by tshift
//   ph        : complex phase field multiplied into both hops
//
// NOTE: the active implementation is compiled only when neither GRID_CUDA nor
// GRID_HIP is defined. On those builds the function performs the asserts and
// returns without touching q_out.
template <class Impl>
void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &ph)// Complex phase factor
{
// mu is unsigned, so the first check can never fail; only mu<Nd is effective.
assert(mu>=0);
assert(mu<Nd);
// Disabled reference implementation (never compiled).
#if 0
int tshift = (mu == Nd-1) ? 1 : 0;
////////////////////////////////////////////////
// SHAMIR CASE
////////////////////////////////////////////////
int Ls = this->Ls;
auto UGrid= this->GaugeGrid();
auto FGrid= this->FermionGrid();
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Gamma gmu=Gamma(Gmu[mu]);
PropagatorField L_Q(UGrid);
PropagatorField R_Q(UGrid);
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
LatticeInteger zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for (int s=0;s<Ls;s++) {
RealD G_s = (curr_type == Current::Axial ) ? ((s < Ls/2) ? -1 : 1) : 1;
ExtractSlice(R_Q, q_in, s , 0);
tmp = Cshift(R_Q,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = G_s*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
tmp = where((lcoor<=tmax),tmp,zz);
L_Q = tmp;
tmp = R_Q*ph;
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd);// Adjoint link
tmp = -G_s*( Utmp + gmu*Utmp );
tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
tmp = where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
L_Q= L_Q+tmp;
InsertSlice(L_Q, q_out, s , 0);
}
#endif
// Active implementation: host builds only (skipped for GRID_CUDA / GRID_HIP).
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
// The backward-hop window is moved by one site when hopping along the
// masked coordinate itself (mu == Nd-1); otherwise both windows coincide.
int tshift = (mu == Nd-1) ? 1 : 0;
////////////////////////////////////////////////
// GENERAL CAYLEY CASE
////////////////////////////////////////////////
// Gamma matrix lookup indexed by mu (mu<Nd is asserted above).
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
Gamma::Algebra::Gamma5
};
Gamma gmu=Gamma(Gmu[mu]);
Gamma g5(Gamma::Algebra::Gamma5);
int Ls = this->Ls;
auto UGrid= this->GaugeGrid();
// FGrid is not referenced again in this branch.
auto FGrid= this->FermionGrid();
std::vector<PropagatorField> R_Q(Ls,UGrid);
PropagatorField L_Q(UGrid);
PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid);
// zz is the "outside the window" value passed to where(); lcoor holds the
// Nd-1 coordinate of every site.
// NOTE(review): zz is declared as LatticeInteger while the fields it is
// paired with in where() are PropagatorField -- confirm this mix is intended.
LatticeInteger zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
// Split q_in into its Ls slices along dimension 0.
for(int s=0;s<Ls;s++){
ExtractSlice(R_Q[s], q_in, s , 0);
}
// Boundary combinations used in place of the generic neighbour term at
// s==0 (R_TmLsGq0) and s==Ls-1 (R_TmLsTmp). They are assembled from the
// Pp/Pm projections of the Q_4d and TopRowWithSource macros applied to R_Q.
PropagatorField R_TmLsGq0(UGrid);
PropagatorField R_TmLsTmp(UGrid);
{
PropagatorField TermA(UGrid);
PropagatorField TermB(UGrid);
PropagatorField TermC(UGrid);
PropagatorField TermD(UGrid);
TermA = (Pp(Q_4d(R_Q)));
TermB = (Pm(Q_4d(R_Q)));
TermC = (Pm(TopRowWithSource(R_Q)));
TermD = (Pp(TopRowWithSource(R_Q)));
R_TmLsGq0 = (TermD - TermA + TermB);
R_TmLsTmp = (TermC - TermB + TermA);
}
// Generic per-slice combination: Pm of slice s plus Pp of the cyclically
// previous slice (s-1 mod Ls).
std::vector<PropagatorField> R_TmLsGq(Ls,UGrid);
for(int s=0;s<Ls;s++){
R_TmLsGq[s] = (Pm((R_Q)[(s)]) + Pp((R_Q)[((s)-1+Ls)%Ls]));
}
// Per-slice sign: +1 everywhere, except -1 on the first Ls/2 slices for
// the axial current.
std::vector<RealD> G_s(Ls,1.0);
if ( curr_type == Current::Axial ) {
for(int s=0;s<Ls/2;s++){
G_s[s] = -1.0;
}
}
for(int s=0;s<Ls;s++){
int sp = (s+1)%Ls;
// sr and srp are computed but not used in this loop.
int sr = Ls-1-s;
int srp= (sr+1)%Ls;
// Mobius parameters
auto b=this->bs[s];
auto c=this->cs[s];
// auto bpc = G_s[s]*1.0/(b+c); // -0.5 factor in gauge links
// Weighted combination for slice s; the two end slices substitute the
// boundary fields built above.
if (s == 0) {
tmp =(b*Pm(R_TmLsGq0) + c*Pp(R_TmLsGq0 ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
} else if (s == Ls-1) {
tmp =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp) + c*Pm(R_TmLsTmp ));
} else {
tmp =(b*Pm(R_TmLsGq[s]) + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
}
tmp = Cshift(tmp,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated
// tmp was overwritten by the forward hop, so the same combination is
// rebuilt here before the backward hop.
if (s == 0) {
tmp =(b*Pm(R_TmLsGq0) + c*Pp(R_TmLsGq0 ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
} else if (s == Ls-1) {
tmp =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp) + c*Pm(R_TmLsTmp ));
} else {
tmp =(b*Pm(R_TmLsGq[s]) + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp])+ c*Pm(R_TmLsGq[sp]));
}
// Backward hop: the phase is applied before the shift, and the link is
// taken from index mu+Nd.
tmp = tmp *ph;
tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -G_s[s]*( Utmp + gmu*Utmp );
tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
// Store the summed forward+backward contribution as slice s of q_out.
InsertSlice(L_Q, q_out, s , 0);
}
#endif
}
// Retire the helper macros used by the conserved-current code above so they
// are no longer defined for the remainder of the translation unit.
#undef Pp
#undef Pm
#undef Q_4d
#undef TopRowWithSource
#if 0 #if 0
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv, void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,

View File

@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
{ {
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
int Ls=this->Ls; int Ls=this->Ls;
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
int Ls=this->Ls; int Ls=this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
auto plee = & lee [0]; auto plee = & lee [0];
auto pdee = & dee [0]; auto pdee = & dee [0];

View File

@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi, psi_i,CpuRead);
auto phi = phi_i.View(); autoView(phi, phi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi, chi_i,CpuWrite);
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
const int nsimd= Simd::Nsimd(); const int nsimd= Simd::Nsimd();
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi=psi_i.View(); autoView(psi,psi_i,CpuRead);
auto phi=phi_i.View(); autoView(phi,phi_i,CpuRead);
auto chi=chi_i.View(); autoView(chi,chi_i,CpuWrite);
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd(); int nsimd= Simd::Nsimd();
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
Vector<iSinglet<Simd> > &Matm) Vector<iSinglet<Simd> > &Matm)
{ {
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
#ifndef AVX512 #ifndef AVX512
{ {
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
EnableIf<Impl::LsVectorised,int> sfinae=0; EnableIf<Impl::LsVectorised,int> sfinae=0;
#ifndef AVX512 #ifndef AVX512
{ {
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM; SiteHalfSpinor BcastM;
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
} }
#else #else
{ {
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
// pointers // pointers
// MASK_REGS; // MASK_REGS;
#define Chi_00 %zmm0 #define Chi_00 %zmm0

View File

@ -94,7 +94,7 @@ void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Ap
template<class Impl> template<class Impl>
RealD ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, FermionField &chi) void ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{ {
int Ls = this->Ls; int Ls = this->Ls;
@ -116,15 +116,14 @@ RealD ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, F
} }
sign=-sign; sign=-sign;
} }
return norm2(chi);
} }
template<class Impl> template<class Impl>
RealD ContinuedFractionFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi) void ContinuedFractionFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
{ {
// This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag // This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
// The rest of matrix is symmetric. // The rest of matrix is symmetric.
// Can ignore "dag" // Can ignore "dag"
return M(psi,chi); M(psi,chi);
} }
template<class Impl> template<class Impl>
void ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ void ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){

View File

@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls; int Ls = this->Ls;
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto phi = phi_i.View(); autoView( phi , phi_i, AcceleratorRead);
auto psi = psi_i.View(); autoView( psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
auto pupper = &upper[0]; auto pupper = &upper[0];
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView( psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView( phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
auto pupper = &upper[0]; auto pupper = &upper[0];
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto psi=psi_i.View(); autoView( psi, psi_i, AcceleratorRead);
auto chi=chi_i.View(); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0]; auto plee = & this->lee[0];
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto psi = psi_i.View(); autoView( psi, psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0]; auto plee = & this->lee[0];

View File

@ -89,7 +89,7 @@ void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionFiel
/*****************************************************************************************************/ /*****************************************************************************************************/
template<class Impl> template<class Impl>
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi) void DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -97,11 +97,10 @@ RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
this->DW(Din, chi, DaggerNo); this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi); axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi); this->M5D(psi, chi);
return(norm2(chi));
} }
template<class Impl> template<class Impl>
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi) void DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -109,7 +108,6 @@ RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& c
this->MeooeDag5D(Din, chi); this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi); this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi); axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
} }
/******************************************************************** /********************************************************************

View File

@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
Compressor compressor; Compressor compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
auto Umu_v = Umu.View(); autoView( Umu_v , Umu, CpuRead);
auto UUUmu_v = UUUmu.View(); autoView( UUUmu_v , UUUmu, CpuRead);
auto in_v = in.View(); autoView( in_v , in, CpuRead);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for( ss,Umu.Grid()->oSites(),{ thread_for( ss,Umu.Grid()->oSites(),{
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sU=ss; int sU=ss;
@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
} }
@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes)); // assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor; Compressor compressor;
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.Prepare(); st.Prepare();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor // st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
double ctime=0;
double ptime=0;
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons. // Remove explicit thread mapping introduced for OPA reasons.
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime) DhopComputeTime-=usecond();
{ {
int tid = omp_get_thread_num(); int interior=1;
int nthreads = omp_get_num_threads(); int exterior=0;
int ncomms = CartesianCommunicator::nCommThreads; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U.Grid()->oSites(); // 4d vol
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
}
}
ptime = usecond() - start;
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
}
} }
DhopCommTime += ctime; DhopComputeTime+=usecond();
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
DhopComputeTime2-=usecond(); st.CommunicateComplete(requests);
DhopCommTime +=usecond();
auto U_v = U.View(); DhopComputeTime2-=usecond();
auto UUU_v = UUU.View(); {
auto in_v = in.View(); int interior=0;
auto out_v = out.View(); int exterior=1;
if (dag == DaggerYes) { Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
});
} else {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
});
} }
DhopComputeTime2+=usecond(); DhopComputeTime2+=usecond();
#else
assert(0);
#endif
} }
template<class Impl> template<class Impl>
@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor; Compressor compressor;
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond(); //double t1=usecond();
DhopTotalTime -= usecond(); DhopTotalTime -= usecond();
DhopCommTime -= usecond(); DhopCommTime -= usecond();
@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
auto U_v = U.View(); {
auto UUU_v = UUU.View(); int interior=1;
auto in_v = in.View(); int exterior=1;
auto out_v = out.View(); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (dag == DaggerYes) {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
});
} else {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
});
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
DhopTotalTime += usecond(); DhopTotalTime += usecond();
//double t2=usecond();
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
} }
/*CHANGE END*/ /*CHANGE END*/
@ -548,21 +470,24 @@ void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vect
assert(0); assert(0);
} }
template <class Impl> template <class Impl>
RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo); Dhop(in, out, DaggerNo);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes); Dhop(in, out, DaggerYes);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo); DhopEO(in, out, DaggerNo);
} else { } else {
@ -570,7 +495,8 @@ void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionFiel
} }
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes); DhopEO(in, out, DaggerYes);
} else { } else {
@ -579,27 +505,30 @@ void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionF
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass); typename FermionField::scalar_type scal(mass);
out = scal * in; out = scal * in;
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Mooee(in, out); Mooee(in, out);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in; out = (1.0 / (mass)) * in;
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in, void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,FermionField &out)
FermionField &out) { {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out); MooeeInv(in, out);
} }
@ -611,6 +540,7 @@ template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1, void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2, PropagatorField &q_in_2,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &src,
Current curr_type, Current curr_type,
unsigned int mu) unsigned int mu)
{ {
@ -620,11 +550,12 @@ void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &src,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
unsigned int tmin, unsigned int tmin,
unsigned int tmax, unsigned int tmax,
ComplexField &lattice_cmplx) ComplexField &lattice_cmplx)
{ {
assert(0); assert(0);

View File

@ -171,21 +171,24 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
///////////////////////////// /////////////////////////////
template <class Impl> template <class Impl>
RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo); Dhop(in, out, DaggerNo);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes); Dhop(in, out, DaggerYes);
return axpy_norm(out, mass, in, out); axpy(out, mass, in, out);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo); DhopEO(in, out, DaggerNo);
} else { } else {
@ -193,7 +196,8 @@ void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField
} }
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) { if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes); DhopEO(in, out, DaggerYes);
} else { } else {
@ -202,27 +206,30 @@ void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionFie
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass); typename FermionField::scalar_type scal(mass);
out = scal * in; out = scal * in;
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
Mooee(in, out); Mooee(in, out);
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) { void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in; out = (1.0 / (mass)) * in;
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,FermionField &out)
FermionField &out) { {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out); MooeeInv(in, out);
} }
@ -234,7 +241,8 @@ void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU,
GaugeField & mat, GaugeField & mat,
const FermionField &A, const FermionField &B, int dag) { const FermionField &A, const FermionField &B, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor; Compressor compressor;
@ -250,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
//////////////////////// ////////////////////////
// Call the single hop // Call the single hop
//////////////////////// ////////////////////////
auto U_v = U.View(); autoView( U_v , U, CpuRead);
auto UUU_v = UUU.View(); autoView( UUU_v , UUU, CpuRead);
auto B_v = B.View(); autoView( B_v , B, CpuWrite);
auto Btilde_v = Btilde.View(); autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{ thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
}); });
@ -284,8 +292,8 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _grid); conformable(U.Grid(), _grid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid()); conformable(U.Grid(), mat.Grid());
@ -296,8 +304,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionFie
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _cbgrid); conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid()); conformable(U.Grid(), mat.Grid());
@ -310,8 +318,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionF
} }
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
conformable(U.Grid(), _cbgrid); conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid()); conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid()); conformable(U.Grid(), mat.Grid());
@ -378,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
Compressor compressor; Compressor compressor;
Stencil.HaloExchange(in, compressor); Stencil.HaloExchange(in, compressor);
auto Umu_v = Umu.View(); autoView( Umu_v , Umu, CpuRead);
auto UUUmu_v = UUUmu.View(); autoView( UUUmu_v , UUUmu, CpuRead);
auto in_v = in.View(); autoView( in_v , in, CpuRead);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for( sss, in.Grid()->oSites(),{ thread_for( sss, in.Grid()->oSites(),{
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
}); });
@ -395,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
} }
template <class Impl> template <class Impl>
@ -409,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
#ifdef GRID_OMP
Compressor compressor; Compressor compressor;
int len = U.Grid()->oSites(); int len = U.Grid()->oSites();
@ -418,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopFaceTime -= usecond(); DhopFaceTime -= usecond();
st.Prepare(); st.Prepare();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
DhopFaceTime += usecond(); DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons. // Removed explicit thread comms
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
#pragma omp parallel
{ {
int tid = omp_get_thread_num(); int interior=1;
int nthreads = omp_get_num_threads(); int exterior=0;
int ncomms = CartesianCommunicator::nCommThreads; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
}
} else {
st.CommunicateThreaded();
}
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing // First to enter, last to leave timing
DhopFaceTime -= usecond(); DhopFaceTime -= usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
@ -479,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopComputeTime2 -= usecond(); DhopComputeTime2 -= usecond();
{ {
auto U_v = U.View(); int interior=0;
auto UUU_v = UUU.View(); int exterior=1;
auto in_v = in.View(); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
auto out_v = out.View();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
} else {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
}
} }
DhopComputeTime2 += usecond(); DhopComputeTime2 += usecond();
#else
assert(0);
#endif
} }
@ -520,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
st.HaloExchange(in, compressor); st.HaloExchange(in, compressor);
DhopCommTime += usecond(); DhopCommTime += usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
if (dag == DaggerYes) { {
thread_for(sss, in.Grid()->oSites(),{ int interior=1;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); int exterior=1;
}); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} else {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
DhopTotalTime += usecond(); DhopTotalTime += usecond();
@ -600,6 +550,7 @@ template <class Impl>
void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1, void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2, PropagatorField &q_in_2,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &src,
Current curr_type, Current curr_type,
unsigned int mu) unsigned int mu)
{ {
@ -609,6 +560,7 @@ void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q
template <class Impl> template <class Impl>
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in, void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out, PropagatorField &q_out,
PropagatorField &src,
Current curr_type, Current curr_type,
unsigned int mu, unsigned int mu,
unsigned int tmin, unsigned int tmin,

View File

@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm; auto pm = this->pm;
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0]; auto plee = & this->lee [0];
auto pdee = & this->dee [0]; auto pdee = & this->dee [0];
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm; auto pm = this->pm;
auto plee = & this->lee [0]; auto plee = & this->lee [0];
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0]; auto plee = & this->lee [0];
auto pdee = & this->dee [0]; auto pdee = & this->dee [0];
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto pm = this->pm; auto pm = this->pm;

View File

@ -166,7 +166,7 @@ void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& c
/*****************************************************************************************************/ /*****************************************************************************************************/
template<class Impl> template<class Impl>
RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi) void MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -174,11 +174,10 @@ RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
this->DW(Din, chi, DaggerNo); this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi); axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi); this->M5D(psi, chi);
return(norm2(chi));
} }
template<class Impl> template<class Impl>
RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi) void MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{ {
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
@ -186,7 +185,6 @@ RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
this->MeooeDag5D(Din, chi); this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi); this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi); axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
} }
/******************************************************************** /********************************************************************

View File

@ -0,0 +1,499 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
// Construct the operator on a full grid (Fgrid) and its red-black
// counterpart (Hgrid) without importing a gauge field.
//
//  _mass     : stored in the `mass` member
//  _c1, _u0  : stored in `c1` / `u0`; they are used by ImportGauge to scale the links
//  p         : implementation parameters, forwarded to the kernels and stencils
//
// Three stencils are set up: one on the full grid and one per source parity
// on the checkerboarded grid. After member initialisation the surface list
// of each stencil is built.
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
                                                   RealD _mass,
                                                   RealD _c1, RealD _u0,
                                                   const ImplParams &p)
  : Kernels(p),
    _grid(&Fgrid),
    _cbgrid(&Hgrid),
    Stencil(&Fgrid, npoint, Even, directions, displacements,p),
    StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
    StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),   // source is Odd
    mass(_mass),
    Lebesgue(_grid),
    LebesgueEvenOdd(_cbgrid),
    Umu(&Fgrid),
    UmuEven(&Hgrid),
    UmuOdd(&Hgrid),
    _tmp(&Hgrid)
{
  // Coefficients consumed later by ImportGauge.
  c1 = _c1;
  u0 = _u0;

  // A single "fifth dimension" slice is used for every surface list.
  int fifthDimExtent = 1;

  // Full-grid stencil uses the full-grid outer site count.
  int fullSites = _grid->oSites();
  Stencil.BuildSurfaceList(fifthDimExtent, fullSites);

  // Both parity stencils share the checkerboarded outer site count.
  int halfSites = _cbgrid->oSites();
  StencilEven.BuildSurfaceList(fifthDimExtent, halfSites);
  StencilOdd.BuildSurfaceList(fifthDimExtent, halfSites);
}
// Convenience constructor: performs the grid-only construction (delegating
// to the constructor above) and then immediately imports the gauge field _U.
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
                                                   GridRedBlackCartesian &Hgrid, RealD _mass,
                                                   RealD _c1, RealD _u0,
                                                   const ImplParams &p)
  : NaiveStaggeredFermion(Fgrid, Hgrid, _mass, _c1, _u0, p)
{
  ImportGauge(_U);
}
////////////////////////////////////////////////////////////
// Momentum space propagator should be
// https://arxiv.org/pdf/hep-lat/9712010.pdf
//
// mom space action.
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
//
// must track through staggered flavour/spin reduction in literature to
// turn to free propagator for the one component chi field, a la page 4/5
// of above link to implement a Fourier-based solver.
////////////////////////////////////////////////////////////
// Refresh the per-parity link fields (UmuEven / UmuOdd) from the full-grid
// field Umu. The two extractions are independent of one another.
template <class Impl>
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
{
  pickCheckerboard(Odd,  UmuOdd,  Umu);
  pickCheckerboard(Even, UmuEven, Umu);
}
// Import a gauge configuration: fill the internal link field Umu from _U,
// scale it so that it carries the fermion kinetic-term coefficients, and
// refresh the even/odd copies.
template <class Impl>
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
{
  GaugeLinkField link(GaugeGrid());
  DoubledGaugeField discardedNaik(GaugeGrid());

  ////////////////////////////////////////////////////////
  // DoubleStore takes two input fields (Naik and one-hop separately).
  // The naive action has no Naik term, so the first output is a scratch
  // field that is simply discarded; only Umu is kept.
  ////////////////////////////////////////////////////////
  Impl::DoubleStore(GaugeGrid(), discardedNaik, Umu, _U, _U);

  ////////////////////////////////////////////////////////
  // Apply scale factors to get the right fermion kinetic term.
  // Could pass coeffs into the double store to save work.
  //   0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
  ////////////////////////////////////////////////////////
  const auto hopCoeff = 0.5*c1/u0;
  for (int mu = 0; mu < Nd; mu++) {
    // Component mu gets +0.5*c1/u0 ...
    link = PeekIndex<LorentzIndex>(Umu, mu);
    PokeIndex<LorentzIndex>(Umu, link*( hopCoeff), mu);

    // ... and component mu+4 gets the opposite sign
    // (presumably the backward-hop links -- confirm against DoubleStore).
    link = PeekIndex<LorentzIndex>(Umu, mu+4);
    PokeIndex<LorentzIndex>(Umu, link*(-hopCoeff), mu+4);
  }

  CopyGaugeCheckerboards();
}
/////////////////////////////
// Implement the interface
/////////////////////////////
// Full operator: hopping term followed by the mass term.
// `out` inherits the checkerboard label of `in`.
template <class Impl>
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();

  // Hopping term (no dagger) ...
  Dhop(in, out, DaggerNo);

  // ... then combine with mass*in via axpy
  // (presumably out = mass*in + out -- confirm against axpy's definition).
  axpy(out, mass, in, out);
}
// Daggered full operator: same structure as M but the hopping term is
// applied with DaggerYes. `out` inherits the checkerboard label of `in`.
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();

  // Daggered hopping term ...
  Dhop(in, out, DaggerYes);

  // ... then combine with mass*in via axpy
  // (presumably out = mass*in + out -- confirm against axpy's definition).
  axpy(out, mass, in, out);
}
// Off-diagonal (parity-changing) part of the operator.
// Dispatches on the parity of the input: Odd input goes through DhopEO,
// anything else through DhopOE.
template <class Impl>
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
{
  if (in.Checkerboard() != Odd) {
    DhopOE(in, out, DaggerNo);
    return;
  }
  DhopEO(in, out, DaggerNo);
}
// Daggered off-diagonal (parity-changing) part of the operator.
// Odd input goes through DhopEO, anything else through DhopOE, both with
// DaggerYes.
template <class Impl>
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
  if (in.Checkerboard() != Odd) {
    DhopOE(in, out, DaggerYes);
    return;
  }
  DhopEO(in, out, DaggerYes);
}
// Diagonal (parity-preserving) part of the operator: multiplication by the
// mass. `out` inherits the checkerboard label of `in`.
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();

  // Convert the mass to the field's scalar type before multiplying.
  typename FermionField::scalar_type massScalar(mass);
  out = massScalar * in;
}
// Daggered diagonal part: implemented by forwarding to Mooee after copying
// the checkerboard label from `in` to `out`.
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();
  Mooee(in, out);
}
// Inverse of the diagonal part: multiplication by 1/mass.
// `out` inherits the checkerboard label of `in`.
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();

  auto inverseMass = 1.0 / (mass);
  out = inverseMass * in;
}
// Daggered inverse of the diagonal part: implemented by forwarding to
// MooeeInv after copying the checkerboard label from `in` to `out`.
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();
  MooeeInv(in, out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
// Shared back end for the DhopDeriv* force routines.
//
//  st   : stencil used for the halo exchange and the directional kernel
//  U    : doubled gauge field passed to the kernel
//  mat  : force output (not written yet -- see below)
//  A, B : fermion fields; B is halo-exchanged and fed to the kernel
//  dag  : must be DaggerNo or DaggerYes
//
// NOTE: the force term is NOT implemented. The routine aborts via assert(0)
// after the first directional hop; it currently only demonstrates the
// intended kernel call.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
                                                GaugeField & mat,
                                                const FermionField &A, const FermionField &B, int dag)
{
  assert((dag == DaggerNo) || (dag == DaggerYes));

  Compressor compressor;

  FermionField Btilde(B.Grid());
  FermionField Atilde(B.Grid());
  Atilde = A;

  st.HaloExchange(B, compressor);

  for (int mu = 0; mu < Nd; mu++) {
    ////////////////////////
    // Call the single hop
    ////////////////////////
    autoView( U_v      , U,      CpuRead);
    // B is a const input and occupies the kernel's input slot, so it is
    // opened read-only. (It was previously opened CpuWrite, which does not
    // match how the field is used.)
    autoView( B_v      , B,      CpuRead);
    autoView( Btilde_v , Btilde, CpuWrite);
    thread_for(sss,B.Grid()->oSites(),{
      // The same link view is passed for both gauge-field arguments.
      Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
    });
    assert(0);// need to figure out the force interface with a blasted three link term.
  }
}
// Force term on the full grid.
// Checks that U, V and mat all live on the full grid, labels `mat` with the
// checkerboard of U and hands over to DerivInternal with the full-grid
// stencil and links.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
  // Grid sanity checks.
  conformable(U.Grid(), _grid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());

  mat.Checkerboard() = U.Checkerboard();

  DerivInternal(Stencil, Umu, mat, U, V, dag);
}
// Force term on the checkerboarded grid with V Even and U Odd.
// The result `mat` is labelled Odd; the work is done by DerivInternal using
// the Even-source stencil and the Odd links.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
  // Grid sanity checks.
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());

  // Parity requirements on the inputs.
  assert(V.Checkerboard() == Even);
  assert(U.Checkerboard() == Odd);

  mat.Checkerboard() = Odd;

  DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
}
// Force term on the checkerboarded grid with V Odd and U Even.
// The result `mat` is labelled Even; the work is done by DerivInternal using
// the Odd-source stencil and the Even links.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
  // Grid sanity checks.
  conformable(U.Grid(), _cbgrid);
  conformable(U.Grid(), V.Grid());
  conformable(U.Grid(), mat.Grid());

  // Parity requirements on the inputs.
  assert(V.Checkerboard() == Odd);
  assert(U.Checkerboard() == Even);

  mat.Checkerboard() = Even;

  DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
}
// Hopping term on the full grid.
// Counts as two calls in the DhopCalls statistic (the half-grid variants
// count as one each).
template <class Impl>
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
  DhopCalls += 2;

  conformable(in.Grid(), _grid);      // input must live on the full grid
  conformable(in.Grid(), out.Grid()); // and output on the same grid

  out.Checkerboard() = in.Checkerboard();

  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
// Hopping term from an Even input to an Odd output on the checkerboarded
// grid. Uses the Even-source stencil together with the Odd links.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
  DhopCalls += 1;

  conformable(in.Grid(), _cbgrid);    // input must live on the half grid
  conformable(in.Grid(), out.Grid()); // grid check only; parity is not compared here

  assert(in.Checkerboard() == Even);
  out.Checkerboard() = Odd;

  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
// Hopping term from an Odd input to an Even output on the checkerboarded
// grid. Uses the Odd-source stencil together with the Even links.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
{
  DhopCalls += 1;

  conformable(in.Grid(), _cbgrid);    // input must live on the half grid
  conformable(in.Grid(), out.Grid()); // grid check only; parity is not compared here

  assert(in.Checkerboard() == Odd);
  out.Checkerboard() = Even;

  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
// Single-direction piece of the operator; a thin forwarder to DhopDir.
// Note that DhopDir currently aborts via assert(0), so this entry point is
// not usable yet.
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
  DhopDir(in, out, dir, disp);
}
// All-directions variant of Mdir. Placeholder only: always aborts.
template <class Impl>
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
  assert(0); // Not implemented yet
}
// Hop in a single direction `dir` with displacement `disp`.
//
// Placeholder only: the halo exchange is performed and views on the link,
// input and output fields are opened, but the kernel call itself is
// disabled and the routine aborts via assert(0).
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
{
  Compressor compressor;
  Stencil.HaloExchange(in, compressor);

  autoView( links_v , Umu, CpuRead);
  autoView( src_v   , in,  CpuRead);
  autoView( dst_v   , out, CpuWrite);

  // Disabled kernel invocation, kept for reference:
  //   thread_for( sss, in.Grid()->oSites(),{
  //     Kernels::DhopDirKernel(Stencil, links_v, Stencil.CommBuf(), sss, sss, src_v, dst_v, dir, disp);
  //   });
  assert(0);
}
// Dispatch the hopping term to one of two communication strategies,
// selected by the global StaggeredKernelsStatic::Comms setting:
// overlapped comms/compute when it equals CommsAndCompute, serial otherwise.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                               DoubledGaugeField &U,
                                               const FermionField &in,
                                               FermionField &out, int dag)
{
  const bool overlapComms =
    (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute);

  if (!overlapComms) {
    DhopInternalSerialComms(st,lo,U,in,out,dag);
    return;
  }
  DhopInternalOverlappedComms(st,lo,U,in,out,dag);
}
// Hopping term with communication overlapped with interior compute.
//
// Sequence:
//   1. gather halo faces and start communication,
//   2. merge shared-memory faces,
//   3. run the kernel with interior=1, exterior=0 while comms are in flight,
//   4. complete communication and merge the received faces,
//   5. run the kernel again with interior=0, exterior=1.
//
// Timing counters: every `-= usecond()` opened here is closed by a matching
// `+= usecond()`, including DhopTotalTime and the face timer around
// CommsMerge, so the accumulated counters stay meaningful.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
                                                              DoubledGaugeField &U,
                                                              const FermionField &in,
                                                              FermionField &out, int dag)
{
  Compressor compressor;

  DhopTotalTime -= usecond();

  // Gather the halo faces.
  DhopFaceTime -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  DhopFaceTime += usecond();

  // Start communication; the comm timer stays open until CommunicateComplete.
  DhopCommTime -=usecond();
  std::vector<std::vector<CommsRequest_t> > requests;
  st.CommunicateBegin(requests);

  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+= usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Removed explicit thread comms
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime -= usecond();
  {
    int interior=1;
    int exterior=0;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime += usecond();

  st.CommunicateComplete(requests);
  DhopCommTime +=usecond();

  // First to enter, last to leave timing
  DhopFaceTime -= usecond();
  st.CommsMerge(compressor);
  DhopFaceTime += usecond();   // was `-=`, which left the face timer unbalanced

  DhopComputeTime2 -= usecond();
  {
    int interior=0;
    int exterior=1;
    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
  }
  DhopComputeTime2 += usecond();

  // Close the total timer opened at the top (previously never closed on this path).
  DhopTotalTime += usecond();
}
// Hopping term with communication completed before any compute:
// a blocking halo exchange followed by a single kernel pass that handles
// interior and exterior sites together. Updates the total, comm and compute
// timing counters.
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
                                                          DoubledGaugeField &U,
                                                          const FermionField &in,
                                                          FermionField &out, int dag)
{
  assert((dag == DaggerNo) || (dag == DaggerYes));

  DhopTotalTime -= usecond();

  // --- communication phase ---
  DhopCommTime -= usecond();
  Compressor compressor;
  st.HaloExchange(in, compressor);
  DhopCommTime += usecond();

  // --- compute phase: all sites in one pass ---
  DhopComputeTime -= usecond();
  {
    int doInterior = 1;
    int doExterior = 1;
    Kernels::DhopNaive(st, lo, U, in, out, dag, doInterior, doExterior);
  }
  DhopComputeTime += usecond();

  DhopTotalTime += usecond();
}
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
// Print the accumulated Dhop call/timing statistics and derived flop rates,
// then the reports of the three stencils.
//
// The compute time is summed across ranks and divided by the rank count to
// get an average. That averaging is done on a local copy so that the
// DhopComputeTime counter itself is left untouched: calling Report() twice,
// or continuing to accumulate after a report, no longer works on a value
// that has already been globally summed and rescaled.
template<class Impl>
void NaiveStaggeredFermion<Impl>::Report(void)
{
  Coordinate latt = _grid->GlobalDimensions();
  RealD volume = 1;
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _grid->_Nprocessors;
  RealD NN = _grid->NodeCount();

  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
            << DhopCalls << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
            << DhopTotalTime / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
            << DhopCommTime / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
            << DhopComputeTime / DhopCalls << " us" << std::endl;

  // Average the compute time over ranks, working on a copy of the counter.
  RealD avgComputeTime = DhopComputeTime;
  _grid->GlobalSum(avgComputeTime);
  avgComputeTime/=NP;

  // 1154 is the per-site cost constant used by this report
  // (presumably flops per site -- confirm against the kernel).
  RealD mflops = 1154*volume*DhopCalls/avgComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;

  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;

  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
// Reset every Dhop call/timing counter of this operator and the counters of
// its three stencils.
template<class Impl>
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
{
  DhopCalls = 0;
  DhopTotalTime = 0;
  DhopCommTime = 0;
  DhopComputeTime = 0;
  // Accumulated by the exterior pass of DhopInternalOverlappedComms;
  // it was previously left out of the reset.
  DhopComputeTime2 = 0;
  DhopFaceTime = 0;

  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
// Conserved-current contraction. Placeholder only: none of the arguments is
// used and the routine always aborts.
template <class Impl>
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
                                                           PropagatorField &q_in_2,
                                                           PropagatorField &q_out,
                                                           PropagatorField &src,
                                                           Current curr_type,
                                                           unsigned int mu)
{
  assert(0); // not implemented
}
// Sequential conserved-current insertion. Placeholder only: none of the
// arguments is used and the routine always aborts.
template <class Impl>
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
                                                      PropagatorField &q_out,
                                                      PropagatorField &src,
                                                      Current curr_type,
                                                      unsigned int mu,
                                                      unsigned int tmin,
                                                      unsigned int tmax,
                                                      ComplexField &lattice_cmplx)
{
  assert(0); // not implemented
}
NAMESPACE_END(Grid);

View File

@ -269,16 +269,14 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
} }
template<class Impl> template<class Impl>
RealD PartialFractionFermion5D<Impl>::M (const FermionField &in, FermionField &out) void PartialFractionFermion5D<Impl>::M (const FermionField &in, FermionField &out)
{ {
M_internal(in,out,DaggerNo); M_internal(in,out,DaggerNo);
return norm2(out);
} }
template<class Impl> template<class Impl>
RealD PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out) void PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
{ {
M_internal(in,out,DaggerYes); M_internal(in,out,DaggerYes);
return norm2(out);
} }
template<class Impl> template<class Impl>

View File

@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
assert(0); assert(0);
@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge2 =(uint64_t)&UU[sU]( Z ); \
gauge3 =(uint64_t)&UU[sU]( T ); gauge3 =(uint64_t)&UU[sU]( T );
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h> #include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHI(addr0,addr1,addr2,addr3);
@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
} }
#include <Grid/simd/Intel512double.h> #include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dag) int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHI(addr0,addr1,addr2,addr3);
@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h> #include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
int sF=s+LLs*sU; {
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1); LOAD_CHIa(addr0,addr1);
@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
} }
#include <Grid/simd/Intel512double.h> #include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
int sF=s+LLs*sU; {
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1); LOAD_CHIa(addr0,addr1);

View File

@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
skew = 0; skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even); HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG (U,Ym,2,skew,odd); HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even); HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd); HAND_STENCIL_LEG (U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even); HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even); HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
}
if ( dag ) { if ( dag ) {
result()()(0) = - even_0 - odd_0; result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1; result()()(1) = - even_1 - odd_1;
@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even); HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd); HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
}
// Assume every site must be connected to at least one interior point. No 1^4 subvols. // Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) { if ( dag ) {
result()()(0) = - even_0 - odd_0; result()()(0) = - even_0 - odd_0;
@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd); HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
}
// Add sum of all exterior connected stencil legs // Add sum of all exterior connected stencil legs
if ( nmu ) { if ( nmu ) {
if ( dag ) { if ( dag ) {
@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
} }
} }
/*
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ #define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \ SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \ const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI #undef LOAD_CHI
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \ if (SE->_is_local ) { \
if (SE->_permute) { \ if (SE->_permute) { \
chi_p = &chi; \ chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \ permute(chi, in[SE->_offset], ptype); \
} else { \ } else { \
chi_p = &in[SE->_offset]; \ chi_p = &in[SE->_offset]; \
} \ } \
} else { \ } else { \
chi_p = &buf[SE->_offset]; \ chi_p = &buf[SE->_offset]; \
@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \ if (SE->_is_local ) { \
if (SE->_permute) { \ if (SE->_permute) { \
chi_p = &chi; \ chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \ permute(chi, in[SE->_offset], ptype); \
} else { \ } else { \
chi_p = &in[SE->_offset]; \ chi_p = &in[SE->_offset]; \
} \ } \
} else if ( st.same_node[Dir] ) { \ } else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \ chi_p = &buf[SE->_offset]; \
} \ } \
if (SE->_is_local || st.same_node[Dir] ) { \ if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U[sU], *chi_p, Dir); \ multLink(Uchi, U[sU], *chi_p, Dir); \
} }
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid);
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \ nmu++; \
chi_p = &buf[SE->_offset]; \ chi_p = &buf[SE->_offset]; \
multLink(Uchi, U[sU], *chi_p, Dir); \ multLink(Uchi, U[sU], *chi_p, Dir); \
} }
template <class Impl> template <class Impl>
@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap // Int, Ext, Int+Ext cases for comms overlap
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag) { const FermionFieldView &in, FermionFieldView &out, int dag)
{
const SiteSpinor *chi_p; const SiteSpinor *chi_p;
SiteSpinor chi; SiteSpinor chi;
SiteSpinor Uchi; SiteSpinor Uchi;
@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
int ptype; int ptype;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; //
// int sF=LLs*sU+s;
{
skew = 0; skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink); GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) { if ( dag ) {
Uchi = - Uchi; Uchi = - Uchi;
} }
@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
// Only contributions from interior of our node // Only contributions from interior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) { const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p; const SiteSpinor *chi_p;
SiteSpinor chi; SiteSpinor chi;
@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
int ptype; int ptype;
int skew ; int skew ;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; // int sF=LLs*sU+s;
{
skew = 0; skew = 0;
Uchi=Zero(); Uchi=Zero();
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) { if ( dag ) {
Uchi = - Uchi; Uchi = - Uchi;
} }
@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
// Only contributions from exterior of our node // Only contributions from exterior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) { const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p; const SiteSpinor *chi_p;
// SiteSpinor chi; // SiteSpinor chi;
@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
int nmu=0; int nmu=0;
int skew ; int skew ;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; // int sF=LLs*sU+s;
{
skew = 0; skew = 0;
Uchi=Zero(); Uchi=Zero();
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( nmu ) { if ( nmu ) {
if ( dag ) { if ( dag ) {
out[sF] = out[sF] - Uchi; out[sF] = out[sF] - Uchi;
@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel // Driving / wrapping routine to select right kernel
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
SiteSpinor *buf, int LLs, int sU, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=1;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=0;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionFieldView &in, FermionFieldView &out,
int dag,int interior,int exterior)
{
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break;
#endif
case OptHandUnroll:
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
};
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
{ {
// Disp should be either +1,-1,+3,-3 // Disp should be either +1,-1,+3,-3
// What about "dag" ? // What about "dag" ?
@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
assert(0); assert(0);
} }
// Kernel-launch helper macros used by the Dhop drivers below.
//
// All three macros expand to statements that reference names which must
// already exist in the expanding scope: Nsite, Ls, st_v, U_v, UUU_v, buf,
// in_v, out_v, dag and the ThisKernel typedef. Each expansion also declares
// a local `NN`, so an expansion has to sit in its own brace scope.
//
// KERNEL_CALLNB(A,improved): loops ss over NN = Nsite*Ls via
// accelerator_forNB, with sF = ss and sU = ss/Ls, and calls the template
// member ThisKernel::A<improved>. The "NB" suffix presumably means
// non-blocking (no barrier is issued here) -- TODO confirm against the
// accelerator_forNB definition.
#define KERNEL_CALLNB(A,improved) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
// KERNEL_CALL(A,improved): KERNEL_CALLNB followed by accelerator_barrier().
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
// ASM_CALL(A): same sF/sU index mapping as above, but iterates with
// thread_for and calls the non-template member ThisKernel::A.
#define ASM_CALL(A) \
const uint64_t NN = Nsite*Ls; \
thread_for( ss, NN, { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
// Driver for the staggered hopping term using both gauge fields U and UUU.
//
// Opens accelerator views on the fields and the stencil, works out the
// fifth-dimension extent Ls (1 unless the fermion grid has one more
// dimension than the gauge grid), and launches exactly one site kernel
// chosen from Opt and the interior/exterior flags. Every kernel is
// instantiated with template flag 1.
//
//   interior && exterior : full kernel (Generic, Hand or Asm)
//   interior only        : *Int kernel (Generic or Hand)
//   exterior only        : *Ext kernel (Generic or Hand)
//
// The Hand and Asm variants are compiled out when GRID_CUDA is defined.
// If no kernel matches, the function asserts.
// `lo` is not referenced in this body.
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
                                          DoubledGaugeField &U, DoubledGaugeField &UUU,
                                          const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
  using ThisKernel = StaggeredKernels<Impl>;

  GridBase *FGrid = in.Grid();
  GridBase *UGrid = U.Grid();

  // Names below (UUU_v, U_v, in_v, out_v, st_v, buf, Ls, Nsite) are consumed
  // by the KERNEL_CALL / ASM_CALL macros and must keep these spellings.
  autoView( UUU_v , UUU, AcceleratorRead);
  autoView( U_v   , U,   AcceleratorRead);
  autoView( in_v  , in,  AcceleratorRead);
  autoView( out_v , out, AcceleratorWrite);
  autoView( st_v  , st,  AcceleratorRead);
  SiteSpinor * buf = st.CommBuf();

  const int Ls    = ( FGrid->Nd() == UGrid->Nd()+1 ) ? FGrid->_rdimensions[0] : 1;
  const int Nsite = UGrid->oSites();

  // Mutually exclusive views of the (interior, exterior) request.
  const bool both         = interior && exterior;
  const bool interiorOnly = interior && !exterior;
  const bool exteriorOnly = !interior && exterior;

  if ( Opt == OptGeneric ) {
    if ( both )         { KERNEL_CALL(DhopSiteGeneric,1);    return; }
    if ( interiorOnly ) { KERNEL_CALL(DhopSiteGenericInt,1); return; }
    if ( exteriorOnly ) { KERNEL_CALL(DhopSiteGenericExt,1); return; }
  }
#ifndef GRID_CUDA
  if ( Opt == OptHandUnroll ) {
    if ( both )         { KERNEL_CALL(DhopSiteHand,1);    return; }
    if ( interiorOnly ) { KERNEL_CALL(DhopSiteHandInt,1); return; }
    if ( exteriorOnly ) { KERNEL_CALL(DhopSiteHandExt,1); return; }
  }
  // The assembly kernel is only offered for the combined interior+exterior pass.
  if ( Opt == OptInlineAsm && both ) { ASM_CALL(DhopSiteAsm); return; }
#endif
  assert(0 && " Kernel optimisation case not covered ");
}
// Driver for the staggered hopping term using the single gauge field U.
//
// Mirrors DhopImproved but launches every site kernel with template flag 0.
// The site kernels still take a second gauge view, so UUU_v is opened on U
// as well.
//
// One kernel is chosen from Opt and the interior/exterior flags:
//   interior && exterior : DhopSiteGeneric    / DhopSiteHand
//   interior only        : DhopSiteGenericInt / DhopSiteHandInt
//   exterior only        : DhopSiteGenericExt / DhopSiteHandExt
// The Hand variants are compiled out when GRID_CUDA is defined, and no
// assembly kernel is dispatched from here.
//
// If no kernel matches (for example Opt == OptInlineAsm, or both flags zero)
// the function now asserts, as DhopImproved does. It used to return silently
// and leave `out` unwritten.
// `lo` is not referenced in this body.
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
  GridBase *FGrid=in.Grid();
  GridBase *UGrid=U.Grid();
  typedef StaggeredKernels<Impl> ThisKernel;
  // Both gauge views refer to U: there is no separate UUU field in this path.
  autoView( UUU_v , U, AcceleratorRead);
  autoView( U_v , U, AcceleratorRead);
  autoView( in_v , in, AcceleratorRead);
  autoView( out_v , out, AcceleratorWrite);
  autoView( st_v , st, AcceleratorRead);
  SiteSpinor * buf = st.CommBuf();

  // Ls is the leading reduced dimension of the fermion grid when that grid
  // has one more dimension than the gauge grid, otherwise 1.
  int Ls=1;
  if(FGrid->Nd()==UGrid->Nd()+1){
    Ls = FGrid->_rdimensions[0];
  }
  int Nsite = UGrid->oSites();

  if( interior && exterior ) {
    if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
  } else if( interior ) {
    if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
  } else if( exterior ) {
    if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
  }
  // No kernel was launched: fail loudly rather than return with `out` untouched.
  assert(0 && " Kernel optimisation case not covered ");
}
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -35,7 +35,7 @@ NAMESPACE_BEGIN(Grid);
// *NOT* EO // *NOT* EO
template <class Impl> template <class Impl>
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out) void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
{ {
FermionField temp(out.Grid()); FermionField temp(out.Grid());
@ -47,11 +47,10 @@ RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
Mooee(in, temp); Mooee(in, temp);
out += temp; out += temp;
return norm2(out);
} }
template <class Impl> template <class Impl>
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out) void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{ {
FermionField temp(out.Grid()); FermionField temp(out.Grid());
@ -63,7 +62,6 @@ RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
MooeeDag(in, temp); MooeeDag(in, temp);
out += temp; out += temp;
return norm2(out);
} }
template <class Impl> template <class Impl>
@ -100,46 +98,49 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
Coordinate lcoor; Coordinate lcoor;
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
for (int site = 0; site < lvol; site++)
{ {
grid->LocalIndexToLocalCoor(site, lcoor); autoView(CTv,CloverTerm,CpuRead);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); autoView(CTIv,CloverTermInv,CpuWrite);
peekLocalSite(Qx, CloverTerm, lcoor); for (int site = 0; site < lvol; site++) {
Qxinv = Zero(); grid->LocalIndexToLocalCoor(site, lcoor);
//if (csw!=0){ EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
for (int j = 0; j < Ns; j++) peekLocalSite(Qx, CTv, lcoor);
for (int k = 0; k < Ns; k++) Qxinv = Zero();
for (int a = 0; a < DimRep; a++) //if (csw!=0){
for (int b = 0; b < DimRep; b++){ for (int j = 0; j < Ns; j++)
auto zz = Qx()(j, k)(a, b); for (int k = 0; k < Ns; k++)
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz); for (int a = 0; a < DimRep; a++)
} for (int b = 0; b < DimRep; b++){
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl; auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
EigenInvCloverOp = EigenCloverOp.inverse(); }
//std::cout << EigenInvCloverOp << std::endl; // if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++) EigenInvCloverOp = EigenCloverOp.inverse();
for (int a = 0; a < DimRep; a++) //std::cout << EigenInvCloverOp << std::endl;
for (int b = 0; b < DimRep; b++) for (int j = 0; j < Ns; j++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); for (int k = 0; k < Ns; k++)
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; for (int a = 0; a < DimRep; a++)
// } for (int b = 0; b < DimRep; b++)
pokeLocalSite(Qxinv, CloverTermInv, lcoor); Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CTIv, lcoor);
}
} }
// Separate the even and odd parts // Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm)); pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm)); pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
} }
template <class Impl> template <class Impl>

View File

@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
cosha = (one + W*W + sk) / (abs(W)*2.0); cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh // FIXME Need a Lattice acosh
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd); {
Tcomplex cc; autoView(cosha_v,cosha,CpuRead);
// RealD sgn; autoView(a_v,a,CpuWrite);
_grid->LocalIndexToLocalCoor(idx,lcoor); for(int idx=0;idx<_grid->lSites();idx++){
peekLocalSite(cc,cosha,lcoor); Coordinate lcoor(Nd);
assert((double)real(cc)>=1.0); Tcomplex cc;
assert(fabs((double)imag(cc))<=1.0e-15); // RealD sgn;
cc = ScalComplex(::acosh(real(cc)),0.0); _grid->LocalIndexToLocalCoor(idx,lcoor);
pokeLocalSite(cc,a,lcoor); peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a_v,lcoor);
}
} }
Wea = ( exp( a) * abs(W) ); Wea = ( exp( a) * abs(W) );
@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
cosha = (one + W*W + sk) / (abs(W)*2.0); cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh // FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){ for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd); Coordinate lcoor(Nd);
Tcomplex cc; Tcomplex cc;
// RealD sgn; // RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor); _grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha,lcoor); peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0); assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15); assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0); cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a,lcoor); pokeLocalSite(cc,a_v,lcoor);
} }}
Wea = ( exp( a) * abs(W) ); Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) ); Wema= ( exp(-a) * abs(W) );
@ -861,7 +869,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
* Conserved current utilities for Wilson fermions, for contracting propagators * Conserved current utilities for Wilson fermions, for contracting propagators
* to make a conserved current sink or inserting the conserved current * to make a conserved current sink or inserting the conserved current
* sequentially. * sequentially.
******************************************************************************/
// Helper macro to reverse Simd vector. Fixme: slow, generic implementation. // Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
#define REVERSE_LS(qSite, qSiteRev, Nsimd) \ #define REVERSE_LS(qSite, qSiteRev, Nsimd) \
@ -877,220 +884,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
merge(qSiteRev, qSiteVec); \ merge(qSiteRev, qSiteVec); \
} }
// psi = chiralProjectPlus(Result_s[Ls/2-1]); ******************************************************************************/
// psi+= chiralProjectMinus(Result_s[Ls/2]);
// PJ5q+=localInnerProduct(psi,psi);
// Returns a new lattice on the same grid as `in` whose value at every outer
// site is in + Gamma5*in. No factor of 1/2 is applied.
template<class vobj>
Lattice<vobj> spProj5p(const Lattice<vobj> & in)
{
  Gamma G5(Gamma::Algebra::Gamma5);

  GridBase *grid = in.Grid();
  Lattice<vobj> projected(grid);

  auto projected_v = projected.View();
  auto in_v        = in.View();
  thread_for(site, grid->oSites(), {
    projected_v[site] = in_v[site] + G5*in_v[site];
  });
  return projected;
}
// Returns a new lattice on the same grid as `in` whose value at every outer
// site is in - Gamma5*in. No factor of 1/2 is applied.
template<class vobj>
Lattice<vobj> spProj5m(const Lattice<vobj> & in)
{
  GridBase *grid = in.Grid();
  Gamma G5(Gamma::Algebra::Gamma5);

  Lattice<vobj> projected(grid);

  auto projected_v = projected.View();
  auto in_v        = in.View();
  thread_for(site, grid->oSites(), {
    projected_v[site] = in_v[site] - G5*in_v[site];
  });
  return projected;
}
// Builds J5q from a 5d fermion field.
//
// Extracts the slices at s = Ls/2 and s = Ls/2-1 of q_in (along dimension 0)
// onto the gauge grid, applies spProj5p to the former and spProj5m to the
// latter, sums them, and stores the local inner product of that sum with
// itself in J5q.
//
// q_in must be conformable with FermionGrid() and J5q with GaugeGrid().
template <class Impl>
void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
{
  conformable(GaugeGrid(), J5q.Grid());
  conformable(q_in.Grid(), FermionGrid());

  // 4d fields (the unused `psi` temporary has been dropped)
  int Ls = this->Ls;
  FermionField p_plus (GaugeGrid());
  FermionField p_minus(GaugeGrid());
  FermionField p(GaugeGrid());

  ExtractSlice(p_plus , q_in, Ls/2 , 0);
  ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
  p_plus = spProj5p(p_plus );
  p_minus= spProj5m(p_minus);
  p=p_plus+p_minus;
  J5q = localInnerProduct(p,p);
}
// Builds the mid-point contraction J5q from a 5D propagator field.
//
//   psi  = (1 + G5) * q_in[s = Ls/2 - 1]
//   psi += (1 - G5) * q_in[s = Ls/2]
//   J5q  = localInnerProduct(psi, psi)
//
// q_in : 5D propagator, must be conformable with FermionGrid().
// J5q  : 4D output, must be conformable with GaugeGrid(); overwritten.
//
// The chiral-plus component is taken from slice Ls/2-1 and the chiral-minus
// component from slice Ls/2 (the two slices were previously read the other
// way round).
// NOTE(review): spProj5p/spProj5m add/subtract G5*in without a factor 1/2, so
// J5q carries that overall normalisation - confirm it is what callers expect.
template <class Impl>
void WilsonFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
{
  conformable(GaugeGrid(), J5q.Grid());
  conformable(q_in.Grid(), FermionGrid());

  // Work on 4D fields extracted from the two central s-slices.
  const int Ls = this->Ls;
  PropagatorField p_plus (GaugeGrid());
  PropagatorField p_minus(GaugeGrid());
  PropagatorField p(GaugeGrid());

  ExtractSlice(p_plus , q_in, Ls/2-1, 0);
  ExtractSlice(p_minus, q_in, Ls/2  , 0);
  p_plus  = spProj5p(p_plus );
  p_minus = spProj5m(p_minus);
  p = p_plus + p_minus;
  J5q = localInnerProduct(p,p);
}
// Contracts two 5D propagators into a 4D field using the conserved-current
// site kernels for direction mu.
//
// q_in_1, q_in_2 : 5D propagators; q_in_1 must be conformable with
//                  FermionGrid() and with q_in_2.
// q_out          : field on _FourDimGrid; set to Zero() and then handed to the
//                  Fwd and Bwd site kernels once per s (presumably accumulating
//                  - the kernels are defined elsewhere).
// curr_type      : when Current::Axial, axial_sign is true for s < LLs/2.
// mu             : direction index; the 5D fields are shifted along dimension
//                  mu + 1.
template <class Impl>
void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)
{
conformable(q_in_1.Grid(), FermionGrid());
conformable(q_in_1.Grid(), q_in_2.Grid());
conformable(_FourDimGrid, q_out.Grid());
PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
// Extent of dimension 0 in the grid's _rdimensions; used as the number of
// s-entries stored per 4D site index below.
unsigned int LLs = q_in_1.Grid()->_rdimensions[0];
q_out = Zero();
// Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s),
// q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
tmp1 = Cshift(q_in_1, mu + 1, 1);
tmp2 = Cshift(q_in_2, mu + 1, 1);
auto q_in_1_v = q_in_1.View();
auto q_in_2_v = q_in_2.View();
auto tmp1_v = tmp1.View();
auto tmp2_v = tmp2.View();
auto q_out_v = q_out.View();
auto Umu_v = Umu.View();
thread_for(sU, Umu.Grid()->oSites(),{
// sF1 walks the s-entries of site sU upwards from the first one, sF2 walks
// them downwards from the last one, so the two indices always pair s with
// LLs - 1 - s.
unsigned int sF1 = sU * LLs;
unsigned int sF2 = (sU + 1) * LLs - 1;
for (unsigned int s = 0; s < LLs; ++s)
{
bool axial_sign = ((curr_type == Current::Axial) && \
(s < (LLs / 2)));
SitePropagator qSite2, qmuSite2;
// If vectorised in 5th dimension, reverse q2 vector to match up
// sites correctly.
if (Impl::LsVectorised)
{
REVERSE_LS(q_in_2_v[sF2], qSite2, Ls / LLs);
REVERSE_LS(tmp2_v[sF2], qmuSite2, Ls / LLs);
}
else
{
qSite2 = q_in_2_v[sF2];
qmuSite2 = tmp2_v[sF2];
}
// Forward term: shifted q1 at sF1 with unshifted q2 at the mirrored index.
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sF1],
qSite2,
q_out_v[sU],
Umu_v, sU, mu, axial_sign);
// Backward term: unshifted q1 at sF1 with shifted q2 at the mirrored index.
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sF1],
qmuSite2,
q_out_v[sU],
Umu_v, sU, mu, axial_sign);
sF1++;
sF2--;
}
});
}
// Sequential conserved-current insertion: for every s, applies the forward and
// backward site kernels to q_in (weighted by lattice_cmplx) and writes into
// q_out, restricted to a window of time coordinates.
//
// q_in          : 5D propagator; must be conformable with FermionGrid() and
//                 with q_out.
// q_out         : 5D output; set to Zero() before the kernels are applied.
// curr_type     : Current::Axial sets axial_sign for s < LLs/2;
//                 Current::Tadpole sets tadpole_sign. The forward kernel is
//                 passed (tadpole_sign || axial_sign), the backward kernel is
//                 passed axial_sign only.
// mu            : direction of the shift applied to each 4D slice.
// tmin, tmax    : inclusive bounds on the Tp coordinate used to build the
//                 per-site mask.
// lattice_cmplx : complex field multiplied into each slice.
template <class Impl>
void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
conformable(q_in.Grid(), FermionGrid());
conformable(q_in.Grid(), q_out.Grid());
PropagatorField tmp(GaugeGrid()),tmp2(GaugeGrid());
// The backward mask window is moved by one only when mu is the Tp direction.
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLs = q_in.Grid()->_rdimensions[0];
unsigned int LLt = GridDefaultLatt()[Tp];
q_out = Zero();
// Lattice of Tp coordinates, used to build the time mask at each site.
LatticeInteger coords(_FourDimGrid);
LatticeCoordinate(coords, Tp);
// NOTE(review): these views are taken once here while tmp2 is reassigned
// inside the loop below; this relies on the views staying valid across those
// assignments - confirm.
auto q_out_v = q_out.View();
auto tmp2_v = tmp2.View();
auto coords_v= coords.View();
auto Umu_v = Umu.View();
for (unsigned int s = 0; s < LLs; ++s)
{
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
bool tadpole_sign = (curr_type == Current::Tadpole);
bool switch_sgn = tadpole_sign || axial_sign;
//forward direction: Need q(x + mu, s)*A(x)
ExtractSlice(tmp2, q_in, s, 0); //q(x,s)
tmp = Cshift(tmp2, mu, 1); //q(x+mu,s)
tmp2 = tmp*lattice_cmplx; //q(x+mu,s)*A(x)
thread_for(sU, Umu.Grid()->oSites(),{
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vPredicate t_mask;
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
Integer timeSlices = Reduce(t_mask());
if (timeSlices > 0)
{
// 5D index of (sU, s).
unsigned int sF = sU * LLs + s;
Kernels::SeqConservedCurrentSiteFwd(tmp2_v[sU],
q_out_v[sF], Umu_v, sU,
mu, t_mask, switch_sgn);
}
});
//backward direction: Need q(x - mu, s)*A(x-mu)
ExtractSlice(tmp2, q_in, s, 0); //q(x,s)
tmp = lattice_cmplx*tmp2; //q(x,s)*A(x)
tmp2 = Cshift(tmp, mu, -1); //q(x-mu,s)*A(x-mu,s)
thread_for(sU, Umu.Grid()->oSites(),
{
// Mask window is [tmin + tshift, tmax + tshift] for the backward term.
vPredicate t_mask;
t_mask()= ((coords_v[sU] >= (tmin + tshift)) && (coords_v[sU] <= (tmax + tshift)));
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
Integer timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
// 5D index of (sU, s).
unsigned int sF = sU * LLs + s;
Kernels::SeqConservedCurrentSiteBwd(tmp2_v[sU],
q_out_v[sF], Umu_v, sU,
mu, t_mask, axial_sign);
}
});
}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

Some files were not shown because too many files have changed in this diff Show More