1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Merge branch 'develop' into feature/baryon

This commit is contained in:
Raoul Hodgson 2020-06-25 16:24:07 +01:00
commit 39cea8b5a7
139 changed files with 6115 additions and 2632 deletions

View File

@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h> #include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h> #include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/threads/Threads.h> #include <Grid/threads/ThreadReduction.h>
#include <Grid/serialisation/Serialisation.h> #include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h> #include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h> #include <Grid/communicator/Communicator.h>

View File

@ -6,6 +6,7 @@
/////////////////// ///////////////////
#include <cassert> #include <cassert>
#include <complex> #include <complex>
#include <memory>
#include <vector> #include <vector>
#include <array> #include <array>
#include <string> #include <string>

View File

@ -18,19 +18,20 @@
#pragma push_macro("__CUDA_ARCH__") #pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__") #pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__") #pragma push_macro("__CUDACC__")
#undef __CUDA_ARCH__
#undef __NVCC__ #undef __NVCC__
#undef __CUDACC__ #undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__ #define __NVCC__REDEFINE__
#endif #endif
/* SYCL save and restore compile environment*/ /* SYCL save and restore compile environment*/
#ifdef __SYCL_DEVICE_ONLY__ #ifdef GRID_SYCL
#pragma push #pragma push
#pragma push_macro("__SYCL_DEVICE_ONLY__") #pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__ #undef __SYCL_DEVICE_ONLY__
#undef EIGEN_USE_SYCL
#define EIGEN_DONT_VECTORIZE #define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif #endif
@ -41,7 +42,7 @@
#ifdef __NVCC__REDEFINE__ #ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__") #pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__") #pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__") #pragma pop_macro("GRID_SIMT")
#pragma pop #pragma pop
#endif #endif

View File

@ -21,7 +21,7 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h extra_headers+=serialisation/Hdf5Type.h
endif endif
all: version-cache all: version-cache Version.h
version-cache: version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\ @if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
@ -42,7 +42,7 @@ version-cache:
fi;\ fi;\
rm -f vertmp rm -f vertmp
Version.h: Version.h: version-cache
cp version-cache Version.h cp version-cache Version.h
.PHONY: version-cache .PHONY: version-cache

View File

@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H #ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h> #include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h> #include <Grid/algorithms/Preconditioner.h>
NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/Zolotarev.h> #include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h> #include <Grid/algorithms/approx/Chebyshev.h>
@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Forecast.h> #include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/approx/RemezGeneral.h> #include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h> #include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/iterative/Deflation.h> #include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h> #include <Grid/algorithms/iterative/BiCGSTAB.h>
NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateResidual.h> #include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h> #include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h> #include <Grid/algorithms/iterative/SchurRedBlack.h>
@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h> #include <Grid/algorithms/iterative/PowerMethod.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/CoarsenedMatrix.h> #include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>
#endif #endif

View File

@ -1,14 +1,3 @@
// blockZaxpy in bockPromote - 3s, 5%
// noncoalesced linalg in Preconditionoer ~ 3s 5%
// Lancos tuning or replace 10-20s ~ 25%, open ended
// setup tuning 5s ~ 8%
// -- e.g. ordermin, orderstep tunables.
// MdagM path without norm in LinOp code. few seconds
// Mdir calc blocking kernels
// Fuse kernels in blockMaskedInnerProduct
// preallocate Vectors in Cayley 5D ~ few percent few seconds
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -91,35 +80,8 @@ public:
} }
directions [2*_d]=0; directions [2*_d]=0;
displacements[2*_d]=0; displacements[2*_d]=0;
//// report back
std::cout<<GridLogMessage<<"directions :";
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
std::cout<<std::endl;
std::cout<<GridLogMessage<<"displacements :";
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
std::cout<<std::endl;
} }
/*
// Original cleaner code
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
for(int d=0;d<dimension;d++){
directions[2*d ] = d;
directions[2*d+1] = d;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*dimension]=0;
displacements[2*dimension]=0;
}
std::vector<int> GetDelta(int point) {
std::vector<int> delta(dimension,0);
delta[directions[point]] = displacements[point];
return delta;
};
*/
}; };
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
@ -149,24 +111,6 @@ public:
CoarseScalar InnerProd(CoarseGrid); CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl; std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace); blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
// blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal();
}
void CheckOrthogonal(void){
CoarseVector iProj(CoarseGrid);
CoarseVector eProj(CoarseGrid);
for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace);
eProj=Zero();
accelerator_for(ss, CoarseGrid->oSites(),1,{
eProj[ss](i)=CComplex(1.0);
});
eProj=eProj - iProj;
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
}
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
} }
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){ void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace); blockProject(CoarseVec,FineVec,subspace);
@ -175,11 +119,6 @@ public:
FineVec.Checkerboard() = subspace[0].Checkerboard(); FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace); blockPromote(CoarseVec,FineVec,subspace);
} }
void CreateSubspaceRandom(GridParallelRNG &RNG){
for(int i=0;i<nbasis;i++){
random(RNG,subspace[i]);
}
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) { virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
@ -218,7 +157,7 @@ public:
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found // and this is the best I found
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
#if 1
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop, virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn, int nn,
double hi, double hi,
@ -280,10 +219,10 @@ public:
hermop.HermOp(*Tn,y); hermop.HermOp(*Tn,y);
auto y_v = y.View(); autoView( y_v , y, AcceleratorWrite);
auto Tn_v = Tn->View(); autoView( Tn_v , (*Tn), AcceleratorWrite);
auto Tnp_v = Tnp->View(); autoView( Tnp_v , (*Tnp), AcceleratorWrite);
auto Tnm_v = Tnm->View(); autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
@ -313,201 +252,6 @@ public:
} }
assert(b==nn); assert(b==nn);
} }
#endif
#if 0
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
FineField combined(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
#define FILTERb(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
Cheb(hermop,noise,Mn); \
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
subspace[b] = Mn; \
hermop.Op(Mn,tmp); \
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
RealD alpha=-0.8;
RealD beta =-0.8;
#define FILTER(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
Cheb(hermop,noise,Mn); \
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
subspace[b] = Mn; \
hermop.Op(Mn,tmp); \
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
#define FILTERc(llo,hhi,oorder) \
{ \
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
Cheb(hermop,noise,combined); \
}
double node = 0.000;
FILTERb(lo,hi,orderfilter);// 0
// FILTERc(node,hi,51);// 0
noise = Mn;
int base = 0;
int mult = 100;
FILTER(node,hi,base+1*mult);
FILTER(node,hi,base+2*mult);
FILTER(node,hi,base+3*mult);
FILTER(node,hi,base+4*mult);
FILTER(node,hi,base+5*mult);
FILTER(node,hi,base+6*mult);
FILTER(node,hi,base+7*mult);
FILTER(node,hi,base+8*mult);
FILTER(node,hi,base+9*mult);
FILTER(node,hi,base+10*mult);
FILTER(node,hi,base+11*mult);
FILTER(node,hi,base+12*mult);
FILTER(node,hi,base+13*mult);
FILTER(node,hi,base+14*mult);
FILTER(node,hi,base+15*mult);
assert(b==nn);
}
#endif
#if 0
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
FineField combined(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
JacobiPoly(hermop,noise,Mn);
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
// subspace[b] = tmp; b++;
// }
}
#define FILTER(lambda) \
{ \
hermop.HermOp(subspace[0],tmp); \
tmp = tmp - lambda *subspace[0]; \
scale = std::pow(norm2(tmp),-0.5); \
tmp=tmp*scale; \
subspace[b] = tmp; \
hermop.Op(subspace[b],tmp); \
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
b++; \
}
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
// subspace[b] = tmp; b++;
// }
FILTER(2.0e-5);
FILTER(2.0e-4);
FILTER(4.0e-4);
FILTER(8.0e-4);
FILTER(8.0e-4);
FILTER(2.0e-3);
FILTER(3.0e-3);
FILTER(4.0e-3);
FILTER(5.0e-3);
FILTER(6.0e-3);
FILTER(2.5e-3);
FILTER(3.5e-3);
FILTER(4.5e-3);
FILTER(5.5e-3);
FILTER(6.5e-3);
// FILTER(6.0e-5);//6
// FILTER(7.0e-5);//8
// FILTER(8.0e-5);//9
// FILTER(9.0e-5);//3
/*
// FILTER(1.0e-4);//10
FILTER(2.0e-4);//11
// FILTER(3.0e-4);//12
// FILTER(4.0e-4);//13
FILTER(5.0e-4);//14
FILTER(6.0e-3);//4
FILTER(7.0e-4);//1
FILTER(8.0e-4);//7
FILTER(9.0e-4);//15
FILTER(1.0e-3);//2
FILTER(2.0e-3);//2
FILTER(3.0e-3);//2
FILTER(4.0e-3);//2
FILTER(5.0e-3);//2
FILTER(6.0e-3);//2
FILTER(7.0e-3);//2
FILTER(8.0e-3);//2
FILTER(1.0e-2);//2
*/
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
assert(b==nn);
}
#endif
}; };
@ -549,13 +293,13 @@ public:
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
autoView( in_v , in, AcceleratorRead);
auto in_v = in.View(); autoView( out_v , out, AcceleratorWrite);
auto out_v = out.View();
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
@ -572,24 +316,25 @@ public:
int ptype; int ptype;
StencilEntry *SE; StencilEntry *SE;
int lane=SIMTlane(Nsimd);
for(int point=0;point<geom.npoint;point++){ for(int point=0;point<geom.npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss); SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) { if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else { } else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
} }
synchronise(); acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) { for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb); res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
} }
} }
coalescedWrite(out_v[ss](b),res,lane); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}; };
void Mdag (const CoarseVector &in, CoarseVector &out) void Mdag (const CoarseVector &in, CoarseVector &out)
@ -617,11 +362,11 @@ public:
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View()); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0]; Aview *Aview_p = & AcceleratorViewContainer[0];
auto out_v = out.View(); autoView( out_v , out, AcceleratorWrite);
auto in_v = in.View(); autoView( in_v , in, AcceleratorRead);
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
@ -635,45 +380,21 @@ public:
int ptype; int ptype;
StencilEntry *SE; StencilEntry *SE;
int lane=SIMTlane(Nsimd);
SE=Stencil.GetEntry(ptype,point,ss); SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) { if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane); nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else { } else {
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane); nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
} }
synchronise(); acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) { for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb); res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
} }
coalescedWrite(out_v[ss](b),res,lane); coalescedWrite(out_v[ss](b),res);
}); });
#if 0 for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
accelerator_for(ss,Grid()->oSites(),1,{
siteVector res = Zero();
siteVector nbr;
int ptype;
StencilEntry *SE;
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in_v[SE->_offset];
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
synchronise();
res = res + Aview_p[point][ss]*nbr;
out_v[ss]=res;
});
#endif
} }
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{ {
@ -841,10 +562,10 @@ public:
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi); blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
auto iZProj_v = iZProj.View() ; autoView( iZProj_v , iZProj, AcceleratorRead) ;
auto oZProj_v = oZProj.View() ; autoView( oZProj_v , oZProj, AcceleratorRead) ;
auto A_p = A[p].View(); autoView( A_p , A[p], AcceleratorWrite);
auto A_self = A[self_stencil].View(); autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
@ -860,11 +581,11 @@ public:
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio); mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
{ {
auto tmp_ = tmp.View(); autoView( tmp_ , tmp, AcceleratorWrite);
auto evenmask_ = evenmask.View(); autoView( evenmask_ , evenmask, AcceleratorRead);
auto oddmask_ = oddmask.View(); autoView( oddmask_ , oddmask, AcceleratorRead);
auto Mphie_ = Mphie.View(); autoView( Mphie_ , Mphie, AcceleratorRead);
auto Mphio_ = Mphio.View(); autoView( Mphio_ , Mphio, AcceleratorRead);
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{ accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss)); coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
}); });
@ -872,8 +593,8 @@ public:
blockProject(SelfProj,tmp,Subspace.subspace); blockProject(SelfProj,tmp,Subspace.subspace);
auto SelfProj_ = SelfProj.View(); autoView( SelfProj_ , SelfProj, AcceleratorRead);
auto A_self = A[self_stencil].View(); autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
for(int j=0;j<nbasis;j++){ for(int j=0;j<nbasis;j++){
@ -887,33 +608,8 @@ public:
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl; std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
ForceHermitian(); ForceHermitian();
} }
// AssertHermitian();
// ForceDiagonal();
} }
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< A[p] << std::endl;
}
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
void ForceHermitian(void) { void ForceHermitian(void) {
CoarseMatrix Diff (Grid()); CoarseMatrix Diff (Grid());
for(int p=0;p<geom.npoint;p++){ for(int p=0;p<geom.npoint;p++){
@ -933,27 +629,6 @@ public:
} }
} }
} }
void AssertHermitian(void) {
CoarseMatrix AA (Grid());
CoarseMatrix AAc (Grid());
CoarseMatrix Diff (Grid());
for(int d=0;d<4;d++){
int dd=d+1;
AAc = Cshift(A[2*d+1],dd,1);
AA = A[2*d];
Diff = AA - adj(AAc);
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
}
Diff = A[8] - adj(A[8]);
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
}
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif #endif
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { }; template<class scalar> struct FFTW { };
@ -191,7 +189,7 @@ public:
typedef typename sobj::scalar_type scalar; typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View(); autoView(pgbuf_v , pgbuf, CpuWrite);
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -232,15 +230,18 @@ public:
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) { for(int p=0;p<processors[dim];p++) {
thread_for(idx, sgrid->lSites(),{ {
autoView(r_v,result,CpuRead);
autoView(p_v,pgbuf,CpuWrite);
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd); Coordinate cbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf); sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf); peekLocalSite(s,r_v,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L; cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L; pokeLocalSite(s,p_v,cbuf);
pokeLocalSite(s,pgbuf,cbuf); });
}); }
if (p != processors[dim] - 1) { if (p != processors[dim] - 1) {
result = Cshift(result,dim,L); result = Cshift(result,dim,L);
} }
@ -269,15 +270,19 @@ public:
flops+= flops_call*NN; flops+= flops_call*NN;
// writing out result // writing out result
thread_for(idx,sgrid->lSites(),{ {
autoView(pgbuf_v,pgbuf,CpuRead);
autoView(result_v,result,CpuWrite);
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd); Coordinate clbuf(Nd), cgbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf); sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf; cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc; cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf); peekLocalSite(s,pgbuf_v,cgbuf);
pokeLocalSite(s,result,clbuf); pokeLocalSite(s,result_v,clbuf);
}); });
}
result = result*div; result = result*div;
// destroying plan // destroying plan

View File

@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
LinearCombTimer.Start(); LinearCombTimer.Start();
bo = beta * omega; bo = beta * omega;
auto p_v = p.View(); {
auto r_v = r.View(); autoView( p_v , p, AcceleratorWrite);
auto v_v = v.View(); autoView( r_v , r, AcceleratorRead);
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{ autoView( v_v , v, AcceleratorRead);
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss)); accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
}); coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
alpha = rho / Calpha.real(); alpha = rho / Calpha.real();
LinearCombTimer.Start(); LinearCombTimer.Start();
auto h_v = h.View(); {
auto psi_v = psi.View(); autoView( p_v , p, AcceleratorRead);
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{ autoView( r_v , r, AcceleratorRead);
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss)); autoView( v_v , v, AcceleratorRead);
}); autoView( psi_v,psi, AcceleratorRead);
autoView( h_v , h, AcceleratorWrite);
auto s_v = s.View(); autoView( s_v , s, AcceleratorWrite);
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{ accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss)); coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
}); });
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
@ -166,11 +172,17 @@ class BiCGSTAB : public OperatorFunction<Field>
omega = Comega.real() / norm2(t); omega = Comega.real() / norm2(t);
LinearCombTimer.Start(); LinearCombTimer.Start();
auto t_v = t.View(); {
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{ autoView( psi_v,psi, AcceleratorWrite);
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss)); autoView( r_v , r, AcceleratorWrite);
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss)); autoView( h_v , h, AcceleratorRead);
}); autoView( s_v , s, AcceleratorRead);
autoView( t_v , t, AcceleratorRead);
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
cp = norm2(r); cp = norm2(r);

View File

@ -140,13 +140,15 @@ public:
b = cp / c; b = cp / c;
LinearCombTimer.Start(); LinearCombTimer.Start();
auto psi_v = psi.View(); {
auto p_v = p.View(); autoView( psi_v , psi, AcceleratorWrite);
auto r_v = r.View(); autoView( p_v , p, AcceleratorWrite);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{ autoView( r_v , r, AcceleratorWrite);
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss)); accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss)); coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
}); coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
}
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();

View File

@ -0,0 +1,241 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_NON_HERM_H
#define GRID_PREC_GCR_NON_HERM_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
void operator() (const Field &src, Field &psi){
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src.Grid());
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(src,psi,rsq);
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.Op(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
return;
}
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b, zAz;
RealD zAAz;
ComplexD rq;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
LinalgTimer.Start();
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= innerProduct(q[peri_k],r); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -6,93 +6,6 @@ NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr; MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false; bool MemoryProfiler::debug = false;
int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
#ifdef GRID_CUDA
int PointerCache::Ncache = 32;
#else
int PointerCache::Ncache = 8;
#endif
int PointerCache::Victim;
int PointerCache::VictimSmall;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
void PointerCache::Init(void)
{
char * str;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) Ncache = atoi(str);
if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) NcacheSmall = atoi(str);
if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
// printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
}
void *PointerCache::Insert(void *ptr,size_t bytes)
{
if (bytes < GRID_ALLOC_SMALL_LIMIT )
return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
return Insert(ptr,bytes,Entries,Ncache,Victim);
}
void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes)
{
if (bytes < GRID_ALLOC_SMALL_LIMIT )
return Lookup(bytes,EntriesSmall,NcacheSmall);
return Lookup(bytes,Entries,Ncache);
}
void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES) void check_huge_pages(void *Buf,uint64_t BYTES)
{ {
#ifdef __linux__ #ifdef __linux__

View File

@ -26,129 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H #pragma once
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
/* Could make these configurable, perhaps up to a max size*/
static const int NcacheSmallMax=128;
static const int NcacheMax=16;
static int NcacheSmall;
static int Ncache;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[NcacheMax];
static int Victim;
static PointerCacheEntry EntriesSmall[NcacheSmallMax];
static int VictimSmall;
public:
static void Init(void);
static void *Insert(void *ptr,size_t bytes) ;
static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes) ;
static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
};
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#ifdef GRID_NVCC
#define profilerCudaMeminfo \
{ size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl;}
#else
#define profilerCudaMeminfo
#endif
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
} \
profilerCudaMeminfo;
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp> template<typename _Tp>
class alignedAllocator { class alignedAllocator {
public: public:
@ -172,70 +53,60 @@ public:
{ {
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr; return ptr;
} }
void deallocate(pointer __p, size_type __n) { void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes); profilerFree(bytes);
MemoryManager::CpuFree((void *)__p,bytes);
}
#ifdef POINTER_CACHE // FIXME: hack for the copy constructor, eventually it must be avoided
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); //void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
#else void construct(pointer __p, const _Tp& __val) { assert(0);};
pointer __freeme = __p; void construct(pointer __p) { };
#endif void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
#ifdef GRID_NVCC template<typename _Tp>
if ( __freeme ) cudaFree((void *)__freeme); class uvmAllocator {
#else public:
#ifdef HAVE_MM_MALLOC_H typedef std::size_t size_type;
if ( __freeme ) _mm_free((void *)__freeme); typedef std::ptrdiff_t difference_type;
#else typedef _Tp* pointer;
if ( __freeme ) free((void *)__freeme); typedef const _Tp* const_pointer;
#endif typedef _Tp& reference;
#endif typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
uvmAllocator() throw() { }
uvmAllocator(const uvmAllocator&) throw() { }
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
~uvmAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::SharedFree((void *)__p,bytes);
} }
// FIXME: hack for the copy constructor, eventually it must be avoided // FIXME: hack for the copy constructor, eventually it must be avoided
@ -244,17 +115,17 @@ public:
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>; template<class T> using commAllocator = uvmAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using commVector = std::vector<T,alignedAllocator<T> >; template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; //template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,4 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/MemoryManager.h>
#include <Grid/allocator/AlignedAllocator.h>

View File

@ -0,0 +1,244 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
}
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes;
}
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
total_device-=bytes;
// PrintBytes();
}
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes;
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes();
}
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_shared-=bytes;
// PrintBytes();
}
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes;
}
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
total_host-=bytes;
}
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
total_host+=bytes;
}
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
total_host-=bytes;
}
}
#endif
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void MemoryManager::Init(void)
{
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
Ncache[Shared]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc;
}
}
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif
#ifdef GRID_UVM
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,181 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryManager.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <list>
#include <unordered_map>
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum ViewAdvise {
AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
// AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
// AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
class MemoryManager {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void PrintBytes(void);
public:
static void Init(void);
static void *SharedAllocate(size_t bytes);
static void SharedFree (void *ptr,size_t bytes);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
static uint64_t DeviceBytes;
static uint64_t DeviceLRUBytes;
static uint64_t DeviceMaxBytes;
static uint64_t HostToDeviceBytes;
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
typedef std::list<uint64_t> LRU_t;
typedef typename LRU_t::iterator LRUiterator;
typedef struct {
int LRU_valid;
LRUiterator LRU_entry;
uint64_t CpuPtr;
uint64_t AccPtr;
size_t bytes;
uint32_t transient;
uint32_t state;
uint32_t accLock;
uint32_t cpuLock;
} AcceleratorViewEntry;
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
static AccViewTable_t AccViewTable;
static LRU_t LRU;
/////////////////////////////////////////////////
// Device motion
/////////////////////////////////////////////////
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
static void Evict(AcceleratorViewEntry &AccCache);
static void Flush(AcceleratorViewEntry &AccCache);
static void Clone(AcceleratorViewEntry &AccCache);
static void AccDiscard(AcceleratorViewEntry &AccCache);
static void CpuDiscard(AcceleratorViewEntry &AccCache);
// static void LRUupdate(AcceleratorViewEntry &AccCache);
static void LRUinsert(AcceleratorViewEntry &AccCache);
static void LRUremove(AcceleratorViewEntry &AccCache);
// manage entries in the table
static int EntryPresent(uint64_t CpuPtr);
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EntryErase (uint64_t CpuPtr);
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
static void AcceleratorViewClose(uint64_t AccPtr);
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
static void NotifyDeletion(void * CpuPtr);
public:
static void Print(void);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,468 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
MemoryManager::LRU_t MemoryManager::LRU;
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
/////////////////////////////////////////////////
// Mechanics of data table maintenance
/////////////////////////////////////////////////
int MemoryManager::EntryPresent(uint64_t CpuPtr)
{
if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
return count;
}
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
assert(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty;
AccCache.LRU_valid=0;
AccCache.transient=0;
AccCache.accLock=0;
AccCache.cpuLock=0;
AccViewTable[CpuPtr] = AccCache;
}
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
assert(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr);
assert(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator;
}
void MemoryManager::EntryErase(uint64_t CpuPtr)
{
auto AccCache = EntryLookup(CpuPtr);
AccViewTable.erase(CpuPtr);
}
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==0);
if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end();
} else {
LRU.push_front(AccCache.CpuPtr);
AccCache.LRU_entry = LRU.begin();
}
AccCache.LRU_valid = 1;
DeviceLRUBytes+=AccCache.bytes;
}
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes;
}
/////////////////////////////////////////////////
// Accelerator cache motion & consistency logic
/////////////////////////////////////////////////
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////
// Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool.
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
// dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
// dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
// dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
// dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==AccDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
// dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
}
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==CpuDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
// dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
AccCache.state=Consistent;
}
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state!=Empty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
AccCache.state=AccDirty;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
} else {
assert(0);
}
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
return NULL;
}
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
}
}
}
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,hint);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Cpu starts primary
if(mode==AcceleratorWriteDiscard){
CpuDiscard(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite){
Clone(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
assert(0);
}
// If view is opened on device remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
LRUremove(AccCache);
}
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
return AccCache.AccPtr;
}
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock==0);
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
LRUinsert(AccCache);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock>0);
assert(AccCache.accLock==0);
AccCache.cpuLock--;
}
/*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,transient);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes);
}
if(AccCache.state==Empty) {
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache.accLock= 0;
AccCache.cpuLock= 1;
} else if(AccCache.state==CpuDirty ){
// AccPtr dont care, deferred allocate
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++;
} else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++;
} else {
assert(0); // should be unreachable
}
AccCache.transient= transient? EvictNext : 0;
return AccCache.CpuPtr;
}
void MemoryManager::NotifyDeletion(void *_ptr)
{
// Look up in ViewCache
uint64_t ptr = (uint64_t)_ptr;
if(EntryPresent(ptr)) {
auto e = EntryLookup(ptr);
AccDiscard(e->second);
}
}
void MemoryManager::Print(void)
{
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
return AccCache.cpuLock+AccCache.accLock;
} else {
return 0;
}
}
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,24 @@
#include <Grid/GridCore.h>
#ifdef GRID_UVM
#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::Print(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);
#endif

View File

@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@ -0,0 +1,95 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);

View File

@ -81,6 +81,7 @@ public:
bool _isCheckerBoarded; bool _isCheckerBoarded;
int LocallyPeriodic; int LocallyPeriodic;
Coordinate _checker_dim_mask;
public: public:

View File

@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
public: public:
int dummy; int dummy;
Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) { virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0; return 0;
} }
@ -104,6 +105,7 @@ public:
_ldimensions.resize(_ndimension); _ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension); _rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension); _simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_lstart.resize(_ndimension); _lstart.resize(_ndimension);
_lend.resize(_ndimension); _lend.resize(_ndimension);
@ -114,6 +116,8 @@ public:
for (int d = 0; d < _ndimension; d++) for (int d = 0; d < _ndimension; d++)
{ {
_checker_dim_mask[d]=0;
_fdimensions[d] = dimensions[d]; // Global dimensions _fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions _gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d]; _simd_layout[d] = simd_layout[d];

View File

@ -36,11 +36,27 @@ static const int CbBlack=1;
static const int Even =CbRed; static const int Even =CbRed;
static const int Odd =CbBlack; static const int Odd =CbBlack;
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
{
int nd=rdim.size();
Coordinate coor(nd);
Lexicographic::CoorFromIndex(coor,oindex,rdim);
int linear=0;
for(int d=0;d<nd;d++){
if(chk_dim_msk[d])
linear=linear+coor[d];
}
return (linear&0x1);
}
// Specialise this for red black grids storing half the data like a chess board. // Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase class GridRedBlackCartesian : public GridBase
{ {
public: public:
Coordinate _checker_dim_mask; // Coordinate _checker_dim_mask;
int _checker_dim; int _checker_dim;
std::vector<int> _checker_board; std::vector<int> _checker_board;

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
#ifdef GRID_NVCC #ifdef GRID_CUDA
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#endif #endif
@ -170,17 +170,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
std::vector<int> primes({2,3,5}); std::vector<int> primes({2,3,5});
int dim = 0; int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1; int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) { while(AutoShmSize != WorldShmSize) {
for(int p=0;p<primes.size();p++) { int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p]; int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim]) if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) { && divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime; AutoShmSize*=prime;
ShmDims[dim]*=prime; ShmDims[dim]*=prime;
last_dim = dim;
break; break;
} }
} }
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension; dim=(dim+1) %ndimension;
} }
} }
@ -413,7 +420,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC #ifdef GRID_CUDA
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@ -433,13 +440,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2); // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
#else
std::cout << "setting device to WorldShmRank"<<std::endl;
cudaSetDevice(WorldShmRank);
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -677,7 +677,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{ {
#ifdef GRID_NVCC #ifdef GRID_CUDA
cudaMemset(dest,0,bytes); cudaMemset(dest,0,bytes);
#else #else
bzero(dest,bytes); bzero(dest,bytes);
@ -685,7 +685,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes) void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{ {
#ifdef GRID_NVCC #ifdef GRID_CUDA
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault); cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else #else
bcopy(src,dest,bytes); bcopy(src,dest,bytes);

View File

@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern Vector<std::pair<int,int> > Cshift_table;
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0; int ent = 0;
static Vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int bo = n*e2; int bo = n*e2;
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b); Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
} }
} }
} else { } else {
@ -65,14 +67,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int o = n*stride; int o = n*stride;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
table[ent++]=std::pair<int,int> (off+bo++,so+o+b); Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
} }
} }
} }
} }
thread_for(i,ent,{ {
buffer[table[i].first]=rhs_v[table[i].second]; autoView(rhs_v , rhs, AcceleratorRead);
}); auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
}
} }
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
@ -95,36 +102,38 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){ if ( cbmask ==0x3){
thread_for_collapse(2,n,e1,{ autoView(rhs_v , rhs, AcceleratorRead);
for(int b=0;b<e2;b++){ accelerator_for2d(n,e1,b,e2,1,{
int o = n*n1; int o = n*n1;
int offset = b+n*e2; int offset = b+n*e2;
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} });
});
} else { } else {
autoView(rhs_v , rhs, AcceleratorRead);
// Case of SIMD split AND checker dim cannot currently be hit, except in Coordinate rdim=rhs.Grid()->_rdimensions;
// Test_cshift_red_black code. Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
thread_for_collapse(2,n,e1,{ accelerator_for2d(n,e1,b,e2,1,{
for(int b=0;b<e2;b++){
Coordinate coor;
int o=n*n1; int o=n*n1;
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b); int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2; int offset = b+n*e2;
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
} });
});
} }
} }
@ -145,7 +154,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs.Grid()->_slice_stride[dimension]; int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent =0; int ent =0;
if ( cbmask ==0x3 ) { if ( cbmask ==0x3 ) {
@ -154,7 +164,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs.Grid()->_slice_block[dimension]; int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b); Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
} }
} }
@ -165,16 +175,20 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int o =n*rhs.Grid()->_slice_stride[dimension]; int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
table[ent++]=std::pair<int,int> (so+o+b,bo++); Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
} }
} }
} }
} }
auto rhs_v = rhs.View(); {
thread_for(i,ent,{ autoView( rhs_v, rhs, AcceleratorWrite);
rhs_v[table[i].first]=buffer[table[i].second]; auto buffer_p = & buffer[0];
}); auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
}
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
@ -194,21 +208,19 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorWrite);
thread_for_collapse(2,n,e1,{ accelerator_for2d(n,e1,b,e2,1,{
for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension]; int offset = b+n*rhs.Grid()->_slice_block[dimension];
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
} });
});
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*rhs.Grid()->_slice_stride[dimension]; int o = n*rhs.Grid()->_slice_stride[dimension];
@ -225,6 +237,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// local to node block strided copies // local to node block strided copies
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask) template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@ -239,14 +252,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0; int ent=0;
if(cbmask == 0x3 ){ if(cbmask == 0x3 ){
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride+b; int o =n*stride+b;
table[ent++] = std::pair<int,int>(lo+o,ro+o); Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
} }
} else { } else {
@ -255,23 +270,24 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int o =n*stride+b; int o =n*stride+b;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
table[ent++] = std::pair<int,int>(lo+o,ro+o); Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
} }
} }
} }
} }
auto rhs_v = rhs.View(); {
auto lhs_v = lhs.View(); autoView(rhs_v , rhs, AcceleratorRead);
thread_for(i,ent,{ autoView(lhs_v , lhs, AcceleratorWrite);
lhs_v[table[i].first]=rhs_v[table[i].second]; auto table = &Cshift_table[0];
}); accelerator_for(i,ent,1,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
}
} }
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs.Grid()->CheckerBoarded(dimension) ) { if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@ -285,29 +301,33 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e2=rhs.Grid()->_slice_block [dimension]; int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension]; int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2); if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
int ent=0; int ent=0;
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} else { } else {
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o =n*stride; int o =n*stride;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b); int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}} }}
} }
auto rhs_v = rhs.View(); {
auto lhs_v = lhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
thread_for(i,ent,{ autoView( lhs_v, lhs, AcceleratorWrite);
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); auto table = &Cshift_table[0];
}); accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
}
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////

View File

@ -0,0 +1,4 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
Vector<std::pair<int,int> > Cshift_table;
NAMESPACE_END(Grid);

View File

@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/lattice/Lattice_view.h>
#include <Grid/lattice/Lattice_base.h> #include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h> #include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h> #include <Grid/lattice/Lattice_ET.h>

View File

@ -92,12 +92,18 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
{ {
return arg[ss]; return arg[ss];
} }
// What needs this?
// Cannot be legal on accelerator
// Comparison must convert
#if 1
template <class lobj> accelerator_inline template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg) const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{ {
auto view = arg.AcceleratorView(ViewRead); auto view = arg.View(AcceleratorRead);
return view[ss]; return view[ss];
} }
#endif
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand // handle nodes in syntax tree- eval one operand
@ -180,16 +186,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
cb = lat.Checkerboard(); cb = lat.Checkerboard();
} }
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr> template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
{
}
template <typename Op, typename T1> inline template <typename Op, typename T1> inline
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr) void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
{ {
CBFromExpression(cb, expr.arg1); // recurse AST CBFromExpression(cb, expr.arg1); // recurse AST
} }
template <typename Op, typename T1, typename T2> inline template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr) void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{ {
@ -204,6 +206,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
CBFromExpression(cb, expr.arg3); // recurse AST CBFromExpression(cb, expr.arg3); // recurse AST
} }
//////////////////////////////////////////////////////////////////////////
// ViewOpen
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
{
lat.ViewOpen(AcceleratorRead);
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // recurse AST
ExpressionViewOpen(expr.arg3); // recurse AST
}
//////////////////////////////////////////////////////////////////////////
// ViewClose
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
{
lat.ViewClose();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
ExpressionViewClose(expr.arg3); // recurse AST
}
//////////////////////////////////////////// ////////////////////////////////////////////
// Unary operators and funcs // Unary operators and funcs
//////////////////////////////////////////// ////////////////////////////////////////////

View File

@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@ -56,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -73,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -89,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
conformable(lhs,rhs); conformable(lhs,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -108,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs); mult(&tmp,&lhs_v(ss),&rhs);
@ -121,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -135,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs); conformable(ret,lhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -148,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret); conformable(lhs,ret);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
@ -165,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -179,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -193,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -206,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs); conformable(ret,rhs);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto rhs_v = lhs.AcceleratorView(ViewRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp; decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
@ -221,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto x_v = x.AcceleratorView(ViewRead); autoView( x_v , x, AcceleratorRead);
auto y_v = y.AcceleratorView(ViewRead); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss); auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@ -234,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard(); ret.Checkerboard() = x.Checkerboard();
conformable(ret,x); conformable(ret,x);
conformable(x,y); conformable(x,y);
auto ret_v = ret.AcceleratorView(ViewWrite); autoView( ret_v , ret, AcceleratorWrite);
auto x_v = x.AcceleratorView(ViewRead); autoView( x_v , x, AcceleratorRead);
auto y_v = y.AcceleratorView(ViewRead); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);

View File

@ -29,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#define STREAMING_STORES #define STREAMING_STORES
@ -37,180 +38,6 @@ NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16]; extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum LatticeAcceleratorAdvise {
AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
ViewRead = 0x1,
ViewWrite = 0x2,
ViewReadWrite = 0x3
};
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
accelerator_inline void Advise(int advise) {
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
if (advise & AdviseInfrequentUse) {
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
}
if (advise & AdviseReadMostly) {
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
}
#endif
#endif
};
accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
int target;
cudaGetDevice(&target);
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
#endif
#endif
};
accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
#endif
#endif
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics. // The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code // This contains extra (host resident) grid pointer data that may be accessed by host code
@ -253,28 +80,23 @@ private:
} }
} }
public: public:
/////////////////////////////////////////////////////////////////////////////////
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose();
}
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops. // Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device // The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas // in device lambdas
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
{ // and HostView for thread_for
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const LatticeView<vobj> View (ViewMode mode) const
{ {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this)); LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.AcceleratorPrefetch(mode);
return accessor;
}
LatticeView<vobj> HostView(int mode = ViewReadWrite) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
accessor.HostPrefetch(mode);
return accessor; return accessor;
} }
@ -298,11 +120,15 @@ public:
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite); auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr) template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@ -317,11 +143,15 @@ public:
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite); auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr) template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@ -335,11 +165,15 @@ public:
CBFromExpression(cb,expr); CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even)); assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb; this->checkerboard=cb;
auto me = AcceleratorView(ViewWrite); auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),1,{ accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr); auto tmp = eval(ss,exprCopy);
vstream(me[ss],tmp); vstream(me[ss],tmp);
}); });
me.ViewClose();
ExpressionViewClose(exprCopy);
return *this; return *this;
} }
//GridFromExpression is tricky to do //GridFromExpression is tricky to do
@ -390,10 +224,11 @@ public:
} }
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){ template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View(); auto me = View(CpuWrite);
thread_for(ss,me.size(),{ thread_for(ss,me.size(),{
me[ss] = r; me[ss]= r;
}); });
me.ViewClose();
return *this; return *this;
} }
@ -403,11 +238,12 @@ public:
/////////////////////////////////////////// ///////////////////////////////////////////
// user defined constructor // user defined constructor
/////////////////////////////////////////// ///////////////////////////////////////////
Lattice(GridBase *grid) { Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid; this->_grid = grid;
resize(this->_grid->oSites()); resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0); assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0; this->checkerboard=0;
SetViewMode(mode);
} }
// virtual ~Lattice(void) = default; // virtual ~Lattice(void) = default;
@ -445,11 +281,12 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0; typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r); conformable(*this,r);
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
auto me = AcceleratorView(ViewWrite); auto me = View(AcceleratorWriteDiscard);
auto him= r.AcceleratorView(ViewRead); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
me.ViewClose(); him.ViewClose();
return *this; return *this;
} }
@ -459,11 +296,12 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){ inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard(); this->checkerboard = r.Checkerboard();
conformable(*this,r); conformable(*this,r);
auto me = AcceleratorView(ViewWrite); auto me = View(AcceleratorWriteDiscard);
auto him= r.AcceleratorView(ViewRead); auto him= r.View(AcceleratorRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{ accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss)); coalescedWrite(me[ss],him(ss));
}); });
me.ViewClose(); him.ViewClose();
return *this; return *this;
} }
/////////////////////////////////////////// ///////////////////////////////////////////

View File

@ -51,34 +51,18 @@ template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0]) Field; typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View()) View; typedef decltype(basis[0].View(AcceleratorRead)) View;
auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
Vector<View> basis_v(basis.size(),tmp_v); Vector<View> basis_v; basis_v.reserve(basis.size());
typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].AcceleratorView(ViewReadWrite); basis_v.push_back(basis[k].View(AcceleratorWrite));
} }
#ifndef GRID_NVCC
thread_region
{
std::vector < vobj > B(Nm); // Thread private
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){ View *basis_vp = &basis_v[0];
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
int nrot = j1-j0; int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda if (!nrot) // edge case not handled gracefully by Cuda
return; return;
@ -86,6 +70,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites(); uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
Vector <vobj> Bt(siteBlock * nrot); Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0]; auto Bp=&Bt[0];
@ -96,7 +82,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
int j = i/Nm; int j = i/Nm;
int k = i%Nm; int k = i%Nm;
Qt_p[i]=Qt(j,k); Qt_p[i]=Qt(j,k);
}); });
// Block the loop to keep storage footprint down // Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){ for(uint64_t s=0;s<oSites;s+=siteBlock){
@ -132,27 +118,30 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j])); coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
}); });
} }
#endif
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
} }
// Extract a single rotated vector // Extract a single rotated vector
template<class Field> template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm) void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{ {
typedef decltype(basis[0].AcceleratorView()) View; typedef decltype(basis[0].View(AcceleratorRead)) View;
typedef typename Field::vector_object vobj; typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid(); GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard(); result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.AcceleratorView(ViewWrite);
Vector<View> basis_v(basis.size(),result_v); Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){ for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].AcceleratorView(ViewRead); basis_v.push_back(basis[k].View(AcceleratorRead));
} }
vobj zz=Zero(); vobj zz=Zero();
Vector<double> Qt_jv(Nm); Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0]; double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k); for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz); auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
@ -160,6 +149,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
} }
coalescedWrite(result_v[ss], B); coalescedWrite(result_v[ss], B);
}); });
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
} }
template<class Field> template<class Field>

View File

@ -78,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]); ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
}); });
@ -93,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{ {
Lattice<vPredicate> ret(lhs.Grid()); Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, lhs_v.size(), { thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs); ret_v[ss]=op(lhs_v[ss],rhs);
}); });
@ -107,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, CpuRead);
auto ret_v = ret.View(); autoView( ret_v, ret, CpuWrite);
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]); ret_v[ss]=op(lhs,rhs_v[ss]);
}); });

View File

@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid(); GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
auto l_v = l.View(); autoView(l_v, l, CpuWrite);
thread_for( o, grid->oSites(), { thread_for( o, grid->oSites(), {
vector_type vI; vector_type vI;
Coordinate gcoor; Coordinate gcoor;
@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
}); });
}; };
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -43,8 +43,8 @@ template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
}); });
@ -56,9 +56,9 @@ template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
}); });
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr; typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid()); Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, AcceleratorRead);
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),1,{ accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer // FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop // Use vector [] operator and don't read coalesce this loop

View File

@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); autoView( X_v , X, CpuRead);
auto Y_v = Y.View(); autoView( Y_v , Y, CpuRead);
auto R_v = R.View(); autoView( R_v , R, CpuWrite);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View(); autoView( X_v , X, CpuRead);
auto R_v = R.View(); autoView( R_v , R, CpuWrite);
thread_region thread_region
{ {
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, CpuRead);
auto rhs_v = rhs.View(); autoView( rhs_v , rhs, CpuRead);
thread_region { thread_region {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);

View File

@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
}); });
return ret; return ret;
@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
}); });
return ret; return ret;
@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{ {
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorWrite);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
}); });
} }
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{ {
auto rhs_v = rhs.View(); autoView( rhs_v, rhs, AcceleratorRead);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorWrite);
thread_for( ss, lhs_v.size(), { accelerator_for( ss, lhs_v.size(), 1, {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
}); });
} }
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// extract-modify-merge cycle is easiest way and this is not perf critical // extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); autoView( l_v , l, CpuWrite);
if ( rank == grid->ThisRank() ) { if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf); extract(l_v[odx],buf);
buf[idx] = s; buf[idx] = s;
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View(); autoView( l_v , l, CpuWrite);
extract(l_v[odx],buf); extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
return; return;
}; };
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Must be CPU read view
template<class vobj,class sobj> template<class vobj,class sobj>
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid = l.Grid(); GridBase *grid = l.getGrid();
assert(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@ -173,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
@ -183,18 +182,19 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
return; return;
}; };
// Must be CPU write view
template<class vobj,class sobj> template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid=l.Grid(); GridBase *grid=l.getGrid();
assert(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@ -202,13 +202,11 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
auto l_v = l.View(); scalar_type * vp = (scalar_type *)&l[odx];
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w]; vp[idx+w*Nsimd] = pt[w];
} }
return; return;
}; };

View File

@ -40,9 +40,11 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss))); coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
}); });
@ -51,9 +53,11 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
}); });

View File

@ -25,7 +25,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
#include <Grid/Grid_Eigen_Dense.h> #include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC #if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h> #include <Grid/lattice/Lattice_reduction_gpu.h>
#endif #endif
@ -39,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd(); // const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj>
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
@ -63,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
ssum = ssum+sumarray[i]; ssum = ssum+sumarray[i];
} }
return ssum; typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
} }
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{ {
#ifdef GRID_NVCC #if defined(GRID_CUDA)||defined(GRID_HIP)
return sum_gpu(arg,osites); return sum_gpu(arg,osites);
#else #else
return sum_cpu(arg,osites); return sum_cpu(arg,osites);
#endif #endif
} }
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{ {
auto arg_v = arg.View(); #if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites(); Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites); auto ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum); arg.Grid()->GlobalSum(ssum);
return ssum; return ssum;
} }
@ -102,42 +151,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
// Might make all code paths go this way.
auto left_v = left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC // Might make all code paths go this way.
// GPU - SIMT lane compliance... typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
accelerator_for( ss, sites, nsimd,{ // GPU - SIMT lane compliance...
auto x_l = left_v(ss); accelerator_for( ss, sites, 1,{
auto y_l = right_v(ss); auto x_l = left_v[ss];
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l)); auto y_l = right_v[ss];
}) inner_tmp_v[ss]=innerProductD(x_l,y_l);
});
}
// This is in single precision and fails some tests // This is in single precision and fails some tests
// Need a sumD that sums in double auto anrm = sum(inner_tmp_v,sites);
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites)); nrm = anrm;
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
return nrm; return nrm;
} }
@ -175,40 +211,24 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid(); GridBase *grid = x.Grid();
auto x_v=x.AcceleratorView(ViewRead);
auto y_v=y.AcceleratorView(ViewRead);
auto z_v=z.AcceleratorView(ViewWrite);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU // GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; autoView( x_v, x, AcceleratorRead);
Vector<inner_t> inner_tmp(sites); autoView( y_v, y, AcceleratorRead);
auto inner_tmp_v = &inner_tmp[0]; autoView( z_v, z, AcceleratorWrite);
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{ accelerator_for( ss, sites, 1,{
auto tmp = a*x_v(ss)+b*y_v(ss); auto tmp = a*x_v[ss]+b*y_v[ss];
inner_tmp_v[ss]=innerProductD(tmp,tmp); inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp; z_v[ss]=tmp;
}); });
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites))); nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
return nrm; return nrm;
} }
@ -224,47 +244,29 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
auto left_v=left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU // GPU
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t; typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites); Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0]; auto norm_tmp_v = &norm_tmp[0];
{
autoView(left_v,left, AcceleratorRead);
autoView(right_v,right,AcceleratorRead);
accelerator_for( ss, sites, 1,{
auto left_tmp = left_v[ss];
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
});
}
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
});
tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
});
// Already promoted to double
tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
#endif
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0]; ip = tmp[0];
nrm = real(tmp[1]); nrm = real(tmp[1]);
@ -335,7 +337,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
auto Data_v=Data.View(); autoView( Data_v, Data, CpuRead);
thread_for( r,rd, { thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
@ -413,8 +415,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View(); autoView( lhv, lhs, CpuRead);
auto rhv=rhs.View(); autoView( rhv, rhs, CpuRead);
thread_for( r,rd,{ thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@ -521,14 +523,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
auto Rv=R.View(); autoView( Rv, R, CpuWrite);
auto Xv=X.View(); autoView( Xv, X, CpuRead);
auto Yv=Y.View(); autoView( Yv, Y, CpuRead);
thread_for_collapse(2, n, e1, { thread_for2d( n, e1, b,e2, {
for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss]; Rv[ss] = at*Xv[ss]+Yv[ss];
}
}); });
} }
}; };
@ -581,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto X_v=X.View(); autoView( X_v, X, CpuRead);
auto Y_v=Y.View(); autoView( Y_v, Y, CpuRead);
auto R_v=R.View(); autoView( R_v, R, CpuWrite);
thread_region thread_region
{ {
Vector<vobj> s_x(Nblock); Vector<vobj> s_x(Nblock);
@ -628,13 +628,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// int nl=1; // int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog]; int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View(); autoView( R_v, R, CpuWrite);
auto X_v = X.View(); autoView( X_v, X, CpuRead);
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@ -692,8 +693,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View(); autoView( lhs_v, lhs, CpuRead);
auto rhs_v=rhs.View(); autoView( rhs_v, rhs, CpuRead);
thread_region thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);

View File

@ -1,7 +1,13 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define WARP_SIZE 32 #ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props; extern cudaDeviceProp *gpu_props;
#endif
#define WARP_SIZE 32
__device__ unsigned int retirementCount = 0; __device__ unsigned int retirementCount = 0;
template <class Iterator> template <class Iterator>
@ -19,7 +25,12 @@ template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device; int device;
#ifdef GRID_CUDA
cudaGetDevice(&device); cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize; Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock; Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
@ -147,7 +158,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
sobj *smem = (sobj *)shmem_pointer; sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished // wait until all outstanding memory instructions in this thread are finished
__threadfence(); acceleratorFence();
if (tid==0) { if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x); unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
@ -156,7 +167,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
} }
// each thread must read the correct value of amLast // each thread must read the correct value of amLast
__syncthreads(); acceleratorSynchroniseAll();
if (amLast) { if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1] // reduce buffer[0], ..., buffer[gridDim.x-1]
@ -199,13 +210,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
cudaDeviceSynchronize(); accelerator_barrier();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
auto result = buffer_v[0]; auto result = buffer_v[0];
return result; return result;
} }

View File

@ -375,7 +375,7 @@ public:
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View(); autoView(l_v, l, CpuWrite);
thread_for( ss, osites, { thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd); ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
@ -462,7 +462,7 @@ public:
{ {
// Obtain one reseeded generator per thread // Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads(); int Nthread = 32; // Hardwire a good level or parallelism
std::vector<RngEngine> seeders(Nthread); std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){ for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine); seeders[t] = Reseed(master_engine);

View File

@ -42,8 +42,8 @@ template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))> inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{ {
Lattice<decltype(trace(vobj()))> ret(lhs.Grid()); Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView(ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView(lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss))); coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
}); });
@ -58,8 +58,8 @@ template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{ {
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v , ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
}); });

View File

@ -47,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard // remove and insert a half checkerboard
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
{
half.Checkerboard() = cb; half.Checkerboard() = cb;
auto half_v = half.View(); autoView( half_v, half, CpuWrite);
auto full_v = full.View(); autoView( full_v, full, CpuRead);
thread_for(ss, full.Grid()->oSites(),{ thread_for(ss, full.Grid()->oSites(),{
int cbos; int cbos;
Coordinate coor; Coordinate coor;
@ -64,11 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
} }
}); });
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ {
int cb = half.Checkerboard(); int cb = half.Checkerboard();
auto half_v = half.View(); autoView( half_v , half, CpuRead);
auto full_v = full.View(); autoView( full_v , full, CpuWrite);
thread_for(ss,full.Grid()->oSites(),{ thread_for(ss,full.Grid()->oSites(),{
Coordinate coor; Coordinate coor;
@ -96,15 +97,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
out = in; out = in;
} }
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in; ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
} }
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) { accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in; ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
} }
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) { accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in; ((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
} }
#endif #endif
@ -151,12 +152,11 @@ accelerator_inline void convertType(T & out, const T & in) {
template<typename T1,typename T2> template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) { accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
auto out_v = out.AcceleratorView(ViewWrite); autoView( out_v , out,AcceleratorWrite);
auto in_v = in.AcceleratorView(ViewRead); autoView( in_v , in ,AcceleratorRead);
accelerator_for(ss,out_v.size(),T1::Nsimd(),{ accelerator_for(ss,out_v.size(),T1::Nsimd(),{
convertType(out_v[ss],in_v(ss)); convertType(out_v[ss],in_v(ss));
}); });
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
@ -164,19 +164,20 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>> -> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
{ {
auto lhs_v = lhs.AcceleratorView(ViewRead); autoView( lhs_v , lhs, AcceleratorRead);
auto rhs_v = rhs.AcceleratorView(ViewRead); autoView( rhs_v , rhs, AcceleratorRead);
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner; typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
Lattice<iScalar<t_inner>> ret(lhs.Grid()); Lattice<iScalar<t_inner>> ret(lhs.Grid());
auto ret_v = ret.AcceleratorView(ViewWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ {
autoView(ret_v, ret,AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss))); convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
}); });
}
return ret; return ret;
} }
@ -194,14 +195,13 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<iScalar<CComplex>> ip(coarse); Lattice<iScalar<CComplex>> ip(coarse);
Lattice<vobj> fineDataRed = fineData; Lattice<vobj> fineDataRed = fineData;
// auto fineData_ = fineData.View(); autoView( coarseData_ , coarseData, AcceleratorWrite);
auto coarseData_ = coarseData.AcceleratorView(ViewWrite); autoView( ip_ , ip, AcceleratorWrite);
auto ip_ = ip.AcceleratorView(ViewReadWrite);
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine> blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]); convertType(coarseData_[sc](v),ip_[sc]);
}); });
// improve numerical stability of projection // improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis> // |fine> = |fine> - <basis|fine> |basis>
@ -210,68 +210,6 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
} }
template<class vobj,class CComplex,int nbasis>
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis)
{
typedef iVector<CComplex,nbasis > coarseSiteData;
coarseSiteData elide;
typedef decltype(coalescedRead(elide)) ScalarComplex;
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension;
// checks
assert( nbasis == Basis.size() );
subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){
conformable(Basis[i],fineData);
}
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
}
int blockVol = fine->oSites()/coarse->oSites();
coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
////////////////////////////////////////////////////////////////////////////////////////////////////////
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
// Otherwise do fine inner product per site, and make the update atomic
////////////////////////////////////////////////////////////////////////////////////////////////////////
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
auto sc=sci/nbasis;
auto i=sci%nbasis;
auto Basis_ = Basis[i].View();
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
int sf;
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
for(int sb=0;sb<blockVol;sb++){
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
}
coalescedWrite(coarseData_[sc](i),reduce);
});
return;
}
template<class vobj,class vobj2,class CComplex> template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -298,10 +236,10 @@ template<class vobj,class vobj2,class CComplex>
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
auto fineZ_ = fineZ.AcceleratorView(ViewWrite); autoView( fineZ_ , fineZ, AcceleratorWrite);
auto fineX_ = fineX.AcceleratorView(ViewRead); autoView( fineX_ , fineX, AcceleratorRead);
auto fineY_ = fineY.AcceleratorView(ViewRead); autoView( fineY_ , fineY, AcceleratorRead);
auto coarseA_= coarseA.AcceleratorView(ViewRead); autoView( coarseA_, coarseA, AcceleratorRead);
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@ -314,7 +252,7 @@ template<class vobj,class vobj2,class CComplex>
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
// z = A x + y // z = A x + y
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
typename vobj2::tensor_reduced::scalar_object cA; typename vobj2::tensor_reduced::scalar_object cA;
typename vobj::scalar_object cAx; typename vobj::scalar_object cAx;
#else #else
@ -344,15 +282,16 @@ template<class vobj,class CComplex>
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
// Precision promotion // Precision promotion
fine_inner = localInnerProductD(fineX,fineY); fine_inner = localInnerProductD<vobj>(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
accelerator_for(ss, coarse->oSites(), 1, { {
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
}); });
}
} }
@ -370,14 +309,15 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
// Precision promotion? // Precision promotion?
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
accelerator_for(ss, coarse->oSites(), 1, { {
CoarseInner_[ss] = coarse_inner_[ss]; autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
}); autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
CoarseInner_[ss] = coarse_inner_[ss];
});
}
} }
template<class vobj,class CComplex> template<class vobj,class CComplex>
@ -408,8 +348,10 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
} }
int blockVol = fine->oSites()/coarse->oSites(); int blockVol = fine->oSites()/coarse->oSites();
auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite); // Turn this around to loop threaded over sc and interior loop
auto fineData_ = fineData.AcceleratorView(ViewRead); // over sf would thread better
autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( fineData_ , fineData, AcceleratorRead);
accelerator_for(sc,coarse->oSites(),1,{ accelerator_for(sc,coarse->oSites(),1,{
@ -510,8 +452,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
for(int d=0 ; d<_ndimension;d++){ for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d]; block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
} }
auto fineData_ = fineData.View(); autoView( fineData_ , fineData, AcceleratorWrite);
auto coarseData_ = coarseData.View(); autoView( coarseData_ , coarseData, AcceleratorRead);
// Loop with a cache friendly loop ordering // Loop with a cache friendly loop ordering
accelerator_for(sf,fine->oSites(),1,{ accelerator_for(sf,fine->oSites(),1,{
@ -524,7 +466,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
auto basis_ = Basis[i].View(); /* auto basis_ = Basis[i], );*/
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]); if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]); else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
} }
@ -543,7 +485,14 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
fineData=Zero(); fineData=Zero();
for(int i=0;i<nbasis;i++) { for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i); Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
auto ip_ = ip.AcceleratorView(ViewRead);
//Lattice<CComplex> cip(coarse);
//autoView( cip_ , cip, AcceleratorWrite);
//autoView( ip_ , ip, AcceleratorRead);
//accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
// coalescedWrite(cip_[sc], ip_(sc)());
// });
//blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
blockZAXPY(fineData,ip,Basis[i],fineData); blockZAXPY(fineData,ip,Basis[i],fineData);
} }
} }
@ -571,15 +520,17 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->lSites() == og->lSites()); assert(ig->lSites() == og->lSites());
} }
autoView(in_v,in,CpuRead);
autoView(out_v,out,CpuWrite);
thread_for(idx, ig->lSites(),{ thread_for(idx, ig->lSites(),{
sobj s; sobj s;
ssobj ss; ssobj ss;
Coordinate lcoor(ni); Coordinate lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor); ig->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(s,in,lcoor); peekLocalSite(s,in_v,lcoor);
ss=s; ss=s;
pokeLocalSite(ss,out,lcoor); pokeLocalSite(ss,out_v,lcoor);
}); });
} }
@ -614,8 +565,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions; Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride; Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride; Coordinate ost = Tg->_ostride;
auto t_v = To.AcceleratorView(ViewWrite);
auto f_v = From.AcceleratorView(ViewRead); autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{ accelerator_for(idx,Fg->lSites(),1,{
sobj s; sobj s;
Coordinate Fcoor(nd); Coordinate Fcoor(nd);
@ -638,8 +590,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
} }
// peekLocalSite(s,From,Fcoor);
// pokeLocalSite(s,To ,Tcoor);
} }
}); });
} }
@ -670,6 +620,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@ -682,8 +634,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl++];
} }
} }
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDimv,hcoor);
}); });
} }
@ -711,6 +663,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@ -723,8 +677,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
hcoor[d]=lcoor[ddl++]; hcoor[d]=lcoor[ddl++];
} }
} }
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDimv,lcoor);
}); });
} }
@ -752,6 +706,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@ -760,8 +716,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
hcoor[orthog] = slice_hi; hcoor[orthog] = slice_hi;
peekLocalSite(s,lowDim,lcoor); peekLocalSite(s,lowDimv,lcoor);
pokeLocalSite(s,higherDim,hcoor); pokeLocalSite(s,higherDimv,hcoor);
} }
}); });
} }
@ -789,6 +745,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuWrite);
autoView(higherDimv,higherDim,CpuRead);
thread_for(idx,lg->lSites(),{ thread_for(idx,lg->lSites(),{
sobj s; sobj s;
Coordinate lcoor(nl); Coordinate lcoor(nl);
@ -797,8 +755,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
if( lcoor[orthog] == slice_lo ) { if( lcoor[orthog] == slice_lo ) {
hcoor=lcoor; hcoor=lcoor;
hcoor[orthog] = slice_hi; hcoor[orthog] = slice_hi;
peekLocalSite(s,higherDim,hcoor); peekLocalSite(s,higherDimv,hcoor);
pokeLocalSite(s,lowDim,lcoor); pokeLocalSite(s,lowDimv,lcoor);
} }
}); });
} }
@ -862,7 +820,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
} }
//loop over outer index //loop over outer index
auto in_v = in.View(); autoView( in_v , in, CpuRead);
thread_for(in_oidx,in_grid->oSites(),{ thread_for(in_oidx,in_grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> out_ptrs(in_nsimd); ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@ -955,7 +913,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
icoor[lane].resize(ndim); icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane); grid->iCoorFromIindex(icoor[lane],lane);
} }
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for(oidx, grid->oSites(),{ thread_for(oidx, grid->oSites(),{
//Assemble vector of pointers to output elements //Assemble vector of pointers to output elements
ExtractPointerArray<sobj> ptrs(nsimd); ExtractPointerArray<sobj> ptrs(nsimd);
@ -1058,7 +1016,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
std::vector<SobjOut> in_slex_conv(in_grid->lSites()); std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in); unvectorizeToLexOrdArray(in_slex_conv, in);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{ thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim); Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx); out_grid->oCoorFromOindex(out_ocoor, out_oidx);

View File

@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss))); coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
}); });
@ -58,8 +58,8 @@ template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))> inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{ {
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View(); autoView( ret_v, ret, AcceleratorWrite);
auto lhs_v = lhs.View(); autoView( lhs_v, lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
}); });

View File

@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs, rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret, ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{ accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y); ret[ss]=pow(rhs[ss],y);
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
} }
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y)); coalescedWrite(ret[ss],mod(rhs(ss),y));
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
ret.Checkerboard() = rhs_i.Checkerboard(); ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y)); coalescedWrite(ret[ss],div(rhs(ss),y));
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){ template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid()); Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View(); autoView( rhs , rhs_i, AcceleratorRead);
auto ret = ret_i.View(); autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs.Checkerboard(); ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{ accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp)); coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));

168
Grid/lattice/Lattice_view.h Normal file
View File

@ -0,0 +1,168 @@
#pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
//public:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
ViewAdvise advise;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { return advise; };
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
// Host only
GridBase * getGrid(void) const { return _grid; };
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
ViewMode mode;
void * cpu_ptr;
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
return coalescedRead(this->_odata[i]);
}
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
this->ViewOpen(mode);
}
// Host functions
void ViewOpen(ViewMode mode)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
this->cpu_ptr = (void *)this->_odata;
this->mode = mode;
this->_odata =(vobj *)
MemoryManager::ViewOpen(this->cpu_ptr,
this->_odata_size*sizeof(vobj),
mode,
this->advise);
}
void ViewClose(void)
{ // Inform the manager
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
ViewCloser(View &_v) : v(_v) {};
~ViewCloser() { v.ViewClose(); }
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
NAMESPACE_END(Grid);

View File

@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <sys/syscall.h> #include <sys/syscall.h>
#endif #endif
#ifdef __x86_64__ #ifdef __x86_64__
#ifdef GRID_NVCC #ifdef GRID_CUDA
accelerator_inline uint64_t __rdtsc(void) { return 0; } accelerator_inline uint64_t __rdtsc(void) { return 0; }
accelerator_inline uint64_t __rdpmc(int ) { return 0; } accelerator_inline uint64_t __rdpmc(int ) { return 0; }
#else #else
@ -112,7 +112,6 @@ class PerformanceCounter {
private: private:
typedef struct { typedef struct {
public:
uint32_t type; uint32_t type;
uint64_t config; uint64_t config;
const char *name; const char *name;

View File

@ -12773,7 +12773,7 @@ namespace pugi
#undef PUGI__THROW_ERROR #undef PUGI__THROW_ERROR
#undef PUGI__CHECK_ERROR #undef PUGI__CHECK_ERROR
#ifdef GRID_NVCC #ifdef GRID_CUDA
#pragma pop #pragma pop
#endif #endif

View File

@ -115,18 +115,21 @@ public:
PokeIndex<LorentzIndex>(Uadj, U, mu); PokeIndex<LorentzIndex>(Uadj, U, mu);
} }
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { autoView(Umu_v,Umu,CpuRead);
autoView(Uadj_v,Uadj,CpuRead);
autoView(Uds_v,Uds,CpuWrite);
thread_for( lidx, GaugeGrid->lSites(), {
Coordinate lcoor; Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu, lcoor); peekLocalSite(ScalarUmu, Umu_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj, lcoor); peekLocalSite(ScalarUmu, Uadj_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds, lcoor); pokeLocalSite(ScalarUds, Uds_v, lcoor);
} });
} }
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)

View File

@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D); NAMESPACE_CHECK(Wilson5D);
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h> #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h> #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
NAMESPACE_CHECK(Staggered); NAMESPACE_CHECK(Staggered);
@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD; typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD; typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
#ifndef GRID_NVCC #ifndef GRID_CUDA
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR; typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF; typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD; typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;

View File

@ -96,11 +96,11 @@ public:
int sl = St._simd_layout[direction]; int sl = St._simd_layout[direction];
Coordinate icoor; Coordinate icoor;
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
_Spinor tmp; _Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd(); const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = SIMTlane(Nsimd); int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s); St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd; int mmu = mu % Nd;
@ -233,14 +233,16 @@ public:
Uconj = where(coor==neglink,-Uconj,Uconj); Uconj = where(coor==neglink,-Uconj,Uconj);
} }
auto U_v = U.View(); {
auto Uds_v = Uds.View(); autoView( U_v , U, CpuRead);
auto Uconj_v = Uconj.View(); autoView( Uconj_v , Uconj, CpuRead);
auto Utmp_v= Utmp.View(); autoView( Uds_v , Uds, CpuWrite);
thread_foreach(ss,U_v,{ autoView( Utmp_v, Utmp, CpuWrite);
Uds_v[ss](0)(mu) = U_v[ss](); thread_foreach(ss,U_v,{
Uds_v[ss](1)(mu) = Uconj_v[ss](); Uds_v[ss](0)(mu) = U_v[ss]();
}); Uds_v[ss](1)(mu) = Uconj_v[ss]();
});
}
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
Uconj = adj(Cshift(Uconj,mu,-1)); Uconj = adj(Cshift(Uconj,mu,-1));
@ -250,19 +252,25 @@ public:
Utmp = where(coor==0,Uconj,Utmp); Utmp = where(coor==0,Uconj,Utmp);
} }
thread_foreach(ss,Utmp_v,{ {
Uds_v[ss](0)(mu+4) = Utmp_v[ss](); autoView( Uds_v , Uds, CpuWrite);
}); autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
});
}
Utmp = Uconj; Utmp = Uconj;
if ( Params.twists[mu] ) { if ( Params.twists[mu] ) {
Utmp = where(coor==0,U,Utmp); Utmp = where(coor==0,U,Utmp);
} }
thread_foreach(ss,Utmp_v,{ {
Uds_v[ss](1)(mu+4) = Utmp_v[ss](); autoView( Uds_v , Uds, CpuWrite);
}); autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
});
}
} }
} }
@ -272,11 +280,14 @@ public:
GaugeLinkField link(mat.Grid()); GaugeLinkField link(mat.Grid());
// use lorentz for flavour as hack. // use lorentz for flavour as hack.
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A)); auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
auto link_v = link.View();
auto tmp_v = tmp.View(); {
thread_foreach(ss,tmp_v,{ autoView( link_v , link, CpuWrite);
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1)); autoView( tmp_v , tmp, CpuRead);
}); thread_foreach(ss,tmp_v,{
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
});
}
PokeIndex<LorentzIndex>(mat, link, mu); PokeIndex<LorentzIndex>(mat, link, mu);
return; return;
} }
@ -306,16 +317,18 @@ public:
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
auto tmp_v = tmp.View(); {
auto Atilde_v = Atilde.View(); autoView( tmp_v , tmp, CpuWrite);
auto Btilde_v = Btilde.View(); autoView( Atilde_v , Atilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{ autoView( Btilde_v , Btilde, CpuRead);
for (int s = 0; s < Ls; s++) { thread_for(ss,tmp.Grid()->oSites(),{
int sF = s + Ls * ss; for (int s = 0; s < Ls; s++) {
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF])); int sF = s + Ls * ss;
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1)); auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
} tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}); }
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu); PokeIndex<LorentzIndex>(mat, tmp, mu);
return; return;
} }

View File

@ -61,8 +61,8 @@ public:
double DhopCalls; double DhopCalls;
double DhopCommTime; double DhopCommTime;
double DhopComputeTime; double DhopComputeTime;
double DhopComputeTime2; double DhopComputeTime2;
double DhopFaceTime; double DhopFaceTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base

View File

@ -0,0 +1,194 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
#define GRID_QCD_NAIVE_STAG_FERMION_H
NAMESPACE_BEGIN(Grid);
class NaiveStaggeredFermionStatic {
public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
};
template <class Impl>
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
void M(const FermionField &in, FermionField &out);
void Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
void Mooee(const FermionField &in, FermionField &out);
void MooeeDag(const FermionField &in, FermionField &out);
void MooeeInv(const FermionField &in, FermionField &out);
void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop (const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st,
DoubledGaugeField &U,
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
NaiveStaggeredFermion(GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge (const GaugeField &_U );
DoubledGaugeField &GetU(void) { return Umu ; } ;
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
// any other parameters of action ???
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &srct,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
NAMESPACE_END(Grid);
#endif

View File

@ -47,23 +47,34 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base; typedef FermionOperator<Impl> Base;
public: public:
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp); DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
protected:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels // Generic Nc kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
@ -71,15 +82,18 @@ public:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels // Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, template<int Naik> accelerator_inline
void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
@ -87,27 +101,10 @@ public:
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels // Asm Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
public: public:

View File

@ -113,20 +113,7 @@ public:
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu) inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
{ {
GridBase *GaugeGrid = U_ds.Grid(); assert(0);
thread_for(lidx, GaugeGrid->lSites(),{
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
Coordinate lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, U_ds, lcoor);
peekLocalSite(ScalarU, U, lcoor);
ScalarUds(mu) = ScalarU();
});
} }
inline void DoubleStore(GridBase *GaugeGrid, inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &UUUds, // for Naik term DoubledGaugeField &UUUds, // for Naik term

View File

@ -257,15 +257,16 @@ private:
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12 // eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices // using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F) CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{ {
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v,T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = timesMinusI(F_v[i]()()); T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()()); T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
@ -281,9 +282,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v, T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v, F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = -F_v[i]()(); T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()(); T_v[i]()(1, 0) = F_v[i]()();
@ -299,9 +300,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView(T_v,T,AcceleratorWrite);
auto F_v = F.View(); autoView(F_v,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 0) = timesMinusI(F_v[i]()()); T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()()); T_v[i]()(1, 1) = timesI(F_v[i]()());
@ -317,9 +318,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v , T, AcceleratorWrite);
auto F_v = F.View(); autoView( F_v , F, AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = timesI(F_v[i]()()); T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()()); T_v[i]()(1, 0) = timesI(F_v[i]()());
@ -335,9 +336,9 @@ private:
CloverFieldType T(F.Grid()); CloverFieldType T(F.Grid());
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v ,T,AcceleratorWrite);
auto F_v = F.View(); autoView( F_v ,F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 1) = -(F_v[i]()()); T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()()); T_v[i]()(1, 0) = (F_v[i]()());
@ -354,9 +355,9 @@ private:
T = Zero(); T = Zero();
auto T_v = T.View(); autoView( T_v , T,AcceleratorWrite);
auto F_v = F.View(); autoView( F_v , F,AcceleratorRead);
thread_for(i, CloverTerm.Grid()->oSites(), accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{ {
T_v[i]()(0, 0) = timesI(F_v[i]()()); T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()()); T_v[i]()(1, 1) = timesMinusI(F_v[i]()());

View File

@ -106,10 +106,10 @@ public:
const _SpinorField & phi, const _SpinorField & phi,
int mu) int mu)
{ {
auto out_v= out.View(); autoView( out_v, out, AcceleratorWrite);
auto phi_v= phi.View(); autoView( phi_v, phi, AcceleratorRead);
auto Umu_v= Umu.View(); autoView( Umu_v, Umu, AcceleratorRead);
thread_for(sss,out.Grid()->oSites(),{ accelerator_for(sss,out.Grid()->oSites(),1,{
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
}); });
} }
@ -191,18 +191,19 @@ public:
int Ls=Btilde.Grid()->_fdimensions[0]; int Ls=Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
auto tmp_v = tmp.View(); {
auto Btilde_v = Btilde.View(); autoView( tmp_v , tmp, AcceleratorWrite);
auto Atilde_v = Atilde.View(); autoView( Btilde_v , Btilde, AcceleratorRead);
thread_for(sss,tmp.Grid()->oSites(),{ autoView( Atilde_v , Atilde, AcceleratorRead);
int sU=sss; accelerator_for(sss,tmp.Grid()->oSites(),1,{
for(int s=0;s<Ls;s++){ int sU=sss;
int sF = s+Ls*sU; for(int s=0;s<Ls;s++){
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here int sF = s+Ls*sU;
} tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
}); }
});
}
PokeIndex<LorentzIndex>(mat,tmp,mu); PokeIndex<LorentzIndex>(mat,tmp,mu);
} }
}; };

View File

@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl; std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl; std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl; std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_NVCC #ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl; std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl; std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type, Current curr_type,
unsigned int mu) unsigned int mu)
{ {
#ifndef GRID_NVCC #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = { Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX, Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY, Gamma::Algebra::GammaY,
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
} }
#endif #endif
#ifndef GRID_NVCC #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0; int tshift = (mu == Nd-1) ? 1 : 0;
//////////////////////////////////////////////// ////////////////////////////////////////////////
// GENERAL CAYLEY CASE // GENERAL CAYLEY CASE

View File

@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
{ {
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
int Ls=this->Ls; int Ls=this->Ls;
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
int Ls=this->Ls; int Ls=this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i,AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i,AcceleratorWrite);
auto plee = & lee [0]; auto plee = & lee [0];
auto pdee = & dee [0]; auto pdee = & dee [0];

View File

@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi = psi_i.View(); autoView(psi, psi_i,CpuRead);
auto phi = phi_i.View(); autoView(phi, phi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi, chi_i,CpuWrite);
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
const int nsimd= Simd::Nsimd(); const int nsimd= Simd::Nsimd();
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
chi_i.Checkerboard()=psi_i.Checkerboard(); chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid(); GridBase *grid=psi_i.Grid();
auto psi=psi_i.View(); autoView(psi,psi_i,CpuRead);
auto phi=phi_i.View(); autoView(phi,phi_i,CpuRead);
auto chi=chi_i.View(); autoView(chi,chi_i,CpuWrite);
int Ls = this->Ls; int Ls = this->Ls;
int LLs = grid->_rdimensions[0]; int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd(); int nsimd= Simd::Nsimd();
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
Vector<iSinglet<Simd> > &Matm) Vector<iSinglet<Simd> > &Matm)
{ {
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0; EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
#ifndef AVX512 #ifndef AVX512
{ {
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
EnableIf<Impl::LsVectorised,int> sfinae=0; EnableIf<Impl::LsVectorised,int> sfinae=0;
#ifndef AVX512 #ifndef AVX512
{ {
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
SiteHalfSpinor BcastP; SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM; SiteHalfSpinor BcastM;
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
} }
#else #else
{ {
auto psi = psi_i.View(); autoView(psi , psi_i,CpuRead);
auto chi = chi_i.View(); autoView(chi , chi_i,CpuWrite);
// pointers // pointers
// MASK_REGS; // MASK_REGS;
#define Chi_00 %zmm0 #define Chi_00 %zmm0

View File

@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls; int Ls = this->Ls;
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto phi = phi_i.View(); autoView( phi , phi_i, AcceleratorRead);
auto psi = psi_i.View(); autoView( psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
auto pupper = &upper[0]; auto pupper = &upper[0];
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView( psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView( phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &diag[0]; auto pdiag = &diag[0];
auto pupper = &upper[0]; auto pupper = &upper[0];
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto psi=psi_i.View(); autoView( psi, psi_i, AcceleratorRead);
auto chi=chi_i.View(); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0]; auto plee = & this->lee[0];
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid(); GridBase* grid = psi_i.Grid();
auto psi = psi_i.View(); autoView( psi, psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto plee = & this->lee[0]; auto plee = & this->lee[0];

View File

@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
Compressor compressor; Compressor compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
auto Umu_v = Umu.View(); autoView( Umu_v , Umu, CpuRead);
auto UUUmu_v = UUUmu.View(); autoView( UUUmu_v , UUUmu, CpuRead);
auto in_v = in.View(); autoView( in_v , in, CpuRead);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for( ss,Umu.Grid()->oSites(),{ thread_for( ss,Umu.Grid()->oSites(),{
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sU=ss; int sU=ss;
@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
} }
@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DoubledGaugeField & U,DoubledGaugeField & UUU, DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes)); // assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor; Compressor compressor;
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.Prepare(); st.Prepare();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor // st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
double ctime=0;
double ptime=0;
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons. // Remove explicit thread mapping introduced for OPA reasons.
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime) DhopComputeTime-=usecond();
{ {
int tid = omp_get_thread_num(); int interior=1;
int nthreads = omp_get_num_threads(); int exterior=0;
int ncomms = CartesianCommunicator::nCommThreads; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U.Grid()->oSites(); // 4d vol
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
}
}
ptime = usecond() - start;
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
}
} }
DhopCommTime += ctime; DhopComputeTime+=usecond();
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopFaceTime-=usecond(); DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond(); DhopFaceTime+=usecond();
DhopComputeTime2-=usecond(); st.CommunicateComplete(requests);
DhopCommTime +=usecond();
auto U_v = U.View(); DhopComputeTime2-=usecond();
auto UUU_v = UUU.View(); {
auto in_v = in.View(); int interior=0;
auto out_v = out.View(); int exterior=1;
if (dag == DaggerYes) { Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
});
} else {
int sz=st.surface_list.size();
thread_for( ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
});
} }
DhopComputeTime2+=usecond(); DhopComputeTime2+=usecond();
#else
assert(0);
#endif
} }
template<class Impl> template<class Impl>
@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor; Compressor compressor;
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond(); //double t1=usecond();
DhopTotalTime -= usecond(); DhopTotalTime -= usecond();
DhopCommTime -= usecond(); DhopCommTime -= usecond();
@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
auto U_v = U.View(); {
auto UUU_v = UUU.View(); int interior=1;
auto in_v = in.View(); int exterior=1;
auto out_v = out.View(); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (dag == DaggerYes) {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
});
} else {
thread_for( ss,U.Grid()->oSites(),{
int sU=ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
});
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
DhopTotalTime += usecond(); DhopTotalTime += usecond();
//double t2=usecond();
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
} }
/*CHANGE END*/ /*CHANGE END*/

View File

@ -258,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
//////////////////////// ////////////////////////
// Call the single hop // Call the single hop
//////////////////////// ////////////////////////
auto U_v = U.View(); autoView( U_v , U, CpuRead);
auto UUU_v = UUU.View(); autoView( UUU_v , UUU, CpuRead);
auto B_v = B.View(); autoView( B_v , B, CpuWrite);
auto Btilde_v = Btilde.View(); autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{ thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1); Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
}); });
@ -386,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
Compressor compressor; Compressor compressor;
Stencil.HaloExchange(in, compressor); Stencil.HaloExchange(in, compressor);
auto Umu_v = Umu.View(); autoView( Umu_v , Umu, CpuRead);
auto UUUmu_v = UUUmu.View(); autoView( UUUmu_v , UUUmu, CpuRead);
auto in_v = in.View(); autoView( in_v , in, CpuRead);
auto out_v = out.View(); autoView( out_v , out, CpuWrite);
thread_for( sss, in.Grid()->oSites(),{ thread_for( sss, in.Grid()->oSites(),{
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp); Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
}); });
@ -403,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
} }
template <class Impl> template <class Impl>
@ -417,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
#ifdef GRID_OMP
Compressor compressor; Compressor compressor;
int len = U.Grid()->oSites(); int len = U.Grid()->oSites();
@ -426,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopFaceTime -= usecond(); DhopFaceTime -= usecond();
st.Prepare(); st.Prepare();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
DhopFaceTime += usecond(); DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons. // Removed explicit thread comms
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
#pragma omp parallel
{ {
int tid = omp_get_thread_num(); int interior=1;
int nthreads = omp_get_num_threads(); int exterior=0;
int ncomms = CartesianCommunicator::nCommThreads; Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
}
}
} else {
st.CommunicateThreaded();
}
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing // First to enter, last to leave timing
DhopFaceTime -= usecond(); DhopFaceTime -= usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
@ -487,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
DhopComputeTime2 -= usecond(); DhopComputeTime2 -= usecond();
{ {
auto U_v = U.View(); int interior=0;
auto UUU_v = UUU.View(); int exterior=1;
auto in_v = in.View(); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
auto out_v = out.View();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
} else {
int sz=st.surface_list.size();
thread_for(ss,sz,{
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
});
}
} }
DhopComputeTime2 += usecond(); DhopComputeTime2 += usecond();
#else
assert(0);
#endif
} }
@ -528,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
st.HaloExchange(in, compressor); st.HaloExchange(in, compressor);
DhopCommTime += usecond(); DhopCommTime += usecond();
auto U_v = U.View();
auto UUU_v = UUU.View();
auto in_v = in.View();
auto out_v = out.View();
DhopComputeTime -= usecond(); DhopComputeTime -= usecond();
if (dag == DaggerYes) { {
thread_for(sss, in.Grid()->oSites(),{ int interior=1;
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v); int exterior=1;
}); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} else {
thread_for(sss, in.Grid()->oSites(),{
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
});
} }
DhopComputeTime += usecond(); DhopComputeTime += usecond();
DhopTotalTime += usecond(); DhopTotalTime += usecond();

View File

@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm; auto pm = this->pm;
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto phi = phi_i.View(); autoView(phi , phi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0]; auto plee = & this->lee [0];
auto pdee = & this->dee [0]; auto pdee = & this->dee [0];
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto pm = this->pm; auto pm = this->pm;
auto plee = & this->lee [0]; auto plee = & this->lee [0];
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
int Ls = this->Ls; int Ls = this->Ls;
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->lee [0]; auto plee = & this->lee [0];
auto pdee = & this->dee [0]; auto pdee = & this->dee [0];
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
{ {
chi_i.Checkerboard() = psi_i.Checkerboard(); chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid(); GridBase *grid = psi_i.Grid();
auto psi = psi_i.View(); autoView(psi , psi_i, AcceleratorRead);
auto chi = chi_i.View(); autoView(chi , chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
auto pm = this->pm; auto pm = this->pm;

View File

@ -0,0 +1,499 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
_tmp(&Hgrid)
{
int vol4;
int LLs=1;
c1=_c1;
u0=_u0;
vol4= _grid->oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4= _cbgrid->oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p)
: NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
{
ImportGauge(_U);
}
////////////////////////////////////////////////////////////
// Momentum space propagator should be
// https://arxiv.org/pdf/hep-lat/9712010.pdf
//
// mom space action.
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
//
// must track through staggered flavour/spin reduction in literature to
// turn to free propagator for the one component chi field, a la page 4/5
// of above link to implmement fourier based solver.
////////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
{
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
{
GaugeLinkField U(GaugeGrid());
DoubledGaugeField _UUU(GaugeGrid());
////////////////////////////////////////////////////////
// Double Store should take two fields for Naik and one hop separately.
// Discard teh Naik as Naive
////////////////////////////////////////////////////////
Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
////////////////////////////////////////////////////////
// Apply scale factors to get the right fermion Kinetic term
// Could pass coeffs into the double store to save work.
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
U = PeekIndex<LorentzIndex>(Umu, mu+4);
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
}
CopyGaugeCheckerboards();
}
/////////////////////////////
// Implement the interface
/////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerNo);
axpy(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Dhop(in, out, DaggerYes);
axpy(out, mass, in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(mass);
out = scal * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
out = (1.0 / (mass)) * in;
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in, out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField & mat,
const FermionField &A, const FermionField &B, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor;
FermionField Btilde(B.Grid());
FermionField Atilde(B.Grid());
Atilde = A;
st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) {
////////////////////////
// Call the single hop
////////////////////////
autoView( U_v , U, CpuRead);
autoView( B_v , B, CpuWrite);
autoView( Btilde_v , Btilde, CpuWrite);
thread_for(sss,B.Grid()->oSites(),{
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
});
assert(0);// need to figure out the force interface with a blasted three link term.
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _grid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
mat.Checkerboard() = U.Checkerboard();
DerivInternal(Stencil, Umu, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Even);
assert(U.Checkerboard() == Odd);
mat.Checkerboard() = Odd;
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U.Grid(), _cbgrid);
conformable(U.Grid(), V.Grid());
conformable(U.Grid(), mat.Grid());
assert(V.Checkerboard() == Odd);
assert(U.Checkerboard() == Even);
mat.Checkerboard() = Even;
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
{
DhopCalls+=1;
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
{
DhopDir(in, out, dir, disp);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
{
assert(0); // Not implemented yet
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
{
Compressor compressor;
Stencil.HaloExchange(in, compressor);
autoView( Umu_v , Umu, CpuRead);
autoView( in_v , in, CpuRead);
autoView( out_v , out, CpuWrite);
// thread_for( sss, in.Grid()->oSites(),{
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
// });
assert(0);
};
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,lo,U,in,out,dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
Compressor compressor;
int len = U.Grid()->oSites();
DhopTotalTime -= usecond();
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
DhopFaceTime += usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests);
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);
DhopFaceTime+= usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Removed explicit thread comms
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=0;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
st.CommunicateComplete(requests);
DhopCommTime +=usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime -= usecond();
DhopComputeTime2 -= usecond();
{
int interior=0;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime2 += usecond();
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
{
int interior=1;
int exterior=1;
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void NaiveStaggeredFermion<Impl>::Report(void)
{
Coordinate latt = _grid->GlobalDimensions();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
assert(0);
}
NAMESPACE_END(Grid);

View File

@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
assert(0); assert(0);
@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge2 =(uint64_t)&UU[sU]( Z ); \
gauge3 =(uint64_t)&UU[sU]( T ); gauge3 =(uint64_t)&UU[sU]( T );
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h> #include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHI(addr0,addr1,addr2,addr3);
@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
} }
#include <Grid/simd/Intel512double.h> #include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dag) int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3); LOAD_CHI(addr0,addr1,addr2,addr3);
@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h> #include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
int sF=s+LLs*sU; {
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1); LOAD_CHIa(addr0,addr1);
@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
} }
#include <Grid/simd/Intel512double.h> #include <Grid/simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &U,
DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out,int dag) int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
#ifdef AVX512 #ifdef AVX512
@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
StencilEntry *SE2; StencilEntry *SE2;
StencilEntry *SE3; StencilEntry *SE3;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// int sF=s+LLs*sU;
int sF=s+LLs*sU; {
// Xp, Yp, Zp, Tp // Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U); PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1); LOAD_CHIa(addr0,addr1);

View File

@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
skew = 0; skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even); HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG (U,Ym,2,skew,odd); HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even); HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd); HAND_STENCIL_LEG (U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even); HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even); HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
}
if ( dag ) { if ( dag ) {
result()()(0) = - even_0 - odd_0; result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1; result()()(1) = - even_1 - odd_1;
@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even); HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd); HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
}
// Assume every site must be connected to at least one interior point. No 1^4 subvols. // Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) { if ( dag ) {
result()()(0) = - even_0 - odd_0; result()()(0) = - even_0 - odd_0;
@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
StencilEntry *SE; StencilEntry *SE;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=s+LLs*sU; // int sF=s+LLs*sU;
{
even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd); HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
if (Naik) {
skew = 8; skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even); HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even); HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd); HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
}
// Add sum of all exterior connected stencil legs // Add sum of all exterior connected stencil legs
if ( nmu ) { if ( nmu ) {
if ( dag ) { if ( dag ) {
@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
} }
} }
/*
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ #define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \ SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \ const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI #undef LOAD_CHI
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \ if (SE->_is_local ) { \
if (SE->_permute) { \ if (SE->_permute) { \
chi_p = &chi; \ chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \ permute(chi, in[SE->_offset], ptype); \
} else { \ } else { \
chi_p = &in[SE->_offset]; \ chi_p = &in[SE->_offset]; \
} \ } \
} else { \ } else { \
chi_p = &buf[SE->_offset]; \ chi_p = &buf[SE->_offset]; \
@ -51,15 +51,15 @@ NAMESPACE_BEGIN(Grid);
if (SE->_is_local ) { \ if (SE->_is_local ) { \
if (SE->_permute) { \ if (SE->_permute) { \
chi_p = &chi; \ chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \ permute(chi, in[SE->_offset], ptype); \
} else { \ } else { \
chi_p = &in[SE->_offset]; \ chi_p = &in[SE->_offset]; \
} \ } \
} else if ( st.same_node[Dir] ) { \ } else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \ chi_p = &buf[SE->_offset]; \
} \ } \
if (SE->_is_local || st.same_node[Dir] ) { \ if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U[sU], *chi_p, Dir); \ multLink(Uchi, U[sU], *chi_p, Dir); \
} }
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
@ -67,7 +67,7 @@ NAMESPACE_BEGIN(Grid);
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \ nmu++; \
chi_p = &buf[SE->_offset]; \ chi_p = &buf[SE->_offset]; \
multLink(Uchi, U[sU], *chi_p, Dir); \ multLink(Uchi, U[sU], *chi_p, Dir); \
} }
template <class Impl> template <class Impl>
@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap // Int, Ext, Int+Ext cases for comms overlap
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag) { const FermionFieldView &in, FermionFieldView &out, int dag)
{
const SiteSpinor *chi_p; const SiteSpinor *chi_p;
SiteSpinor chi; SiteSpinor chi;
SiteSpinor Uchi; SiteSpinor Uchi;
@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
int ptype; int ptype;
int skew; int skew;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; //
// int sF=LLs*sU+s;
{
skew = 0; skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink); GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) { if ( dag ) {
Uchi = - Uchi; Uchi = - Uchi;
} }
@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
// Only contributions from interior of our node // Only contributions from interior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) { const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p; const SiteSpinor *chi_p;
SiteSpinor chi; SiteSpinor chi;
@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
int ptype; int ptype;
int skew ; int skew ;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; // int sF=LLs*sU+s;
{
skew = 0; skew = 0;
Uchi=Zero(); Uchi=Zero();
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( dag ) { if ( dag ) {
Uchi = - Uchi; Uchi = - Uchi;
} }
@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
// Only contributions from exterior of our node // Only contributions from exterior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo, template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) { const FermionFieldView &in, FermionFieldView &out,int dag) {
const SiteSpinor *chi_p; const SiteSpinor *chi_p;
// SiteSpinor chi; // SiteSpinor chi;
@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
int nmu=0; int nmu=0;
int skew ; int skew ;
for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
int sF=LLs*sU+s; // int sF=LLs*sU+s;
{
skew = 0; skew = 0;
Uchi=Zero(); Uchi=Zero();
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
if ( Naik ) {
skew=8; skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
}
if ( nmu ) { if ( nmu ) {
if ( dag ) { if ( dag ) {
out[sF] = out[sF] - Uchi; out[sF] = out[sF] - Uchi;
@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel // Driving / wrapping routine to select right kernel
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
SiteSpinor *buf, int LLs, int sU, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=1;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,
int interior,int exterior)
{
int dag=0;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionFieldView &in, FermionFieldView &out,
int dag,int interior,int exterior)
{
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break;
#endif
case OptHandUnroll:
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
};
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
{ {
// Disp should be either +1,-1,+3,-3 // Disp should be either +1,-1,+3,-3
// What about "dag" ? // What about "dag" ?
@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
assert(0); assert(0);
} }
#define KERNEL_CALLNB(A,improved) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
#define ASM_CALL(A) \
const uint64_t NN = Nsite*Ls; \
thread_for( ss, NN, { \
int sF = ss; \
int sU = ss/Ls; \
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
});
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
autoView( UUU_v , UUU, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel;
autoView( UUU_v , U, AcceleratorRead);
autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
SiteSpinor * buf = st.CommBuf();
int Ls=1;
if(FGrid->Nd()==UGrid->Nd()+1){
Ls = FGrid->_rdimensions[0];
}
int Nsite = UGrid->oSites();
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
}
}
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -98,32 +98,35 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
Coordinate lcoor; Coordinate lcoor;
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
for (int site = 0; site < lvol; site++)
{ {
grid->LocalIndexToLocalCoor(site, lcoor); autoView(CTv,CloverTerm,CpuRead);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); autoView(CTIv,CloverTermInv,CpuWrite);
peekLocalSite(Qx, CloverTerm, lcoor); for (int site = 0; site < lvol; site++) {
Qxinv = Zero(); grid->LocalIndexToLocalCoor(site, lcoor);
//if (csw!=0){ EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
for (int j = 0; j < Ns; j++) peekLocalSite(Qx, CTv, lcoor);
for (int k = 0; k < Ns; k++) Qxinv = Zero();
for (int a = 0; a < DimRep; a++) //if (csw!=0){
for (int b = 0; b < DimRep; b++){ for (int j = 0; j < Ns; j++)
auto zz = Qx()(j, k)(a, b); for (int k = 0; k < Ns; k++)
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz); for (int a = 0; a < DimRep; a++)
} for (int b = 0; b < DimRep; b++){
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl; auto zz = Qx()(j, k)(a, b);
EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex<double>(zz);
}
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse(); EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl; //std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++) for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++) for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++) for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++) for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// } // }
pokeLocalSite(Qxinv, CloverTermInv, lcoor); pokeLocalSite(Qxinv, CTIv, lcoor);
}
} }
// Separate the even and odd parts // Separate the even and odd parts

View File

@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
cosha = (one + W*W + sk) / (abs(W)*2.0); cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh // FIXME Need a Lattice acosh
for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd); {
Tcomplex cc; autoView(cosha_v,cosha,CpuRead);
// RealD sgn; autoView(a_v,a,CpuWrite);
_grid->LocalIndexToLocalCoor(idx,lcoor); for(int idx=0;idx<_grid->lSites();idx++){
peekLocalSite(cc,cosha,lcoor); Coordinate lcoor(Nd);
assert((double)real(cc)>=1.0); Tcomplex cc;
assert(fabs((double)imag(cc))<=1.0e-15); // RealD sgn;
cc = ScalComplex(::acosh(real(cc)),0.0); _grid->LocalIndexToLocalCoor(idx,lcoor);
pokeLocalSite(cc,a,lcoor); peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a_v,lcoor);
}
} }
Wea = ( exp( a) * abs(W) ); Wea = ( exp( a) * abs(W) );
@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
cosha = (one + W*W + sk) / (abs(W)*2.0); cosha = (one + W*W + sk) / (abs(W)*2.0);
// FIXME Need a Lattice acosh // FIXME Need a Lattice acosh
{
autoView(cosha_v,cosha,CpuRead);
autoView(a_v,a,CpuWrite);
for(int idx=0;idx<_grid->lSites();idx++){ for(int idx=0;idx<_grid->lSites();idx++){
Coordinate lcoor(Nd); Coordinate lcoor(Nd);
Tcomplex cc; Tcomplex cc;
// RealD sgn; // RealD sgn;
_grid->LocalIndexToLocalCoor(idx,lcoor); _grid->LocalIndexToLocalCoor(idx,lcoor);
peekLocalSite(cc,cosha,lcoor); peekLocalSite(cc,cosha_v,lcoor);
assert((double)real(cc)>=1.0); assert((double)real(cc)>=1.0);
assert(fabs((double)imag(cc))<=1.0e-15); assert(fabs((double)imag(cc))<=1.0e-15);
cc = ScalComplex(::acosh(real(cc)),0.0); cc = ScalComplex(::acosh(real(cc)),0.0);
pokeLocalSite(cc,a,lcoor); pokeLocalSite(cc,a_v,lcoor);
} }}
Wea = ( exp( a) * abs(W) ); Wea = ( exp( a) * abs(W) );
Wema= ( exp(-a) * abs(W) ); Wema= ( exp(-a) * abs(W) );

View File

@ -67,7 +67,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
diag_mass = 4.0 + mass; diag_mass = 4.0 + mass;
} }
int vol4;
vol4=Fgrid.oSites();
Stencil.BuildSurfaceList(1,vol4);
vol4=Hgrid.oSites();
StencilEven.BuildSurfaceList(1,vol4);
StencilOdd.BuildSurfaceList(1,vol4);
} }
template <class Impl> template <class Impl>
@ -483,32 +488,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
conformable(_grid, q_in_1.Grid()); conformable(_grid, q_in_1.Grid());
conformable(_grid, q_in_2.Grid()); conformable(_grid, q_in_2.Grid());
conformable(_grid, q_out.Grid()); conformable(_grid, q_out.Grid());
#if 0 assert(0);
PropagatorField tmp1(_grid), tmp2(_grid);
q_out = Zero();
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
// Inefficient comms method but not performance critical.
tmp1 = Cshift(q_in_1, mu, 1);
tmp2 = Cshift(q_in_2, mu, 1);
auto tmp1_v = tmp1.View();
auto tmp2_v = tmp2.View();
auto q_in_1_v=q_in_1.View();
auto q_in_2_v=q_in_2.View();
auto q_out_v = q_out.View();
auto Umu_v = Umu.View();
thread_for(sU, Umu.Grid()->oSites(),{
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
q_in_2_v[sU],
q_out_v[sU],
Umu_v, sU, mu);
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
tmp2_v[sU],
q_out_v[sU],
Umu_v, sU, mu);
});
#else
#endif
} }
@ -524,62 +504,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
{ {
conformable(_grid, q_in.Grid()); conformable(_grid, q_in.Grid());
conformable(_grid, q_out.Grid()); conformable(_grid, q_out.Grid());
#if 0 assert(0);
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
Complex i(0.0,1.0);
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
q_out = Zero();
LatticeInteger coords(_grid);
LatticeCoordinate(coords, Tp);
// Need q(x + mu) and q(x - mu).
tmp = Cshift(q_in, mu, 1);
tmpFwd = tmp*lattice_cmplx;
tmp = lattice_cmplx*q_in;
tmpBwd = Cshift(tmp, mu, -1);
auto coords_v = coords.View();
auto tmpFwd_v = tmpFwd.View();
auto tmpBwd_v = tmpBwd.View();
auto Umu_v = Umu.View();
auto q_out_v = q_out.View();
thread_for(sU, Umu.Grid()->oSites(), {
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vPredicate t_mask;
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
Integer timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
q_out_v[sU],
Umu_v, sU, mu, t_mask);
}
// Repeat for backward direction.
t_mask() = ((coords_v[sU] >= (tmin + tshift)) &&
(coords_v[sU] <= (tmax + tshift)));
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
timeSlices = Reduce(t_mask());
if (timeSlices > 0) {
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
q_out_v[sU],
Umu_v, sU, mu, t_mask);
}
});
#else
#endif
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -39,9 +39,10 @@ NAMESPACE_BEGIN(Grid);
// Generic implementation; move to different file? // Generic implementation; move to different file?
//////////////////////////////////////////// ////////////////////////////////////////////
/*
accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
{ {
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size"); static_assert(sizeof(StencilEntry)==sizeof(uint4),"Unexpected Stencil Entry Size");
uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads uint4 * mem_pun = (uint4 *)mem; // force 128 bit loads
uint4 * chip_pun = (uint4 *)&chip; uint4 * chip_pun = (uint4 *)&chip;
@ -51,6 +52,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
#endif #endif
return; return;
} }
*/
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \ #define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \ SE = st.GetEntry(ptype, Dir, sF); \
@ -61,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \ } else { \
chi = coalescedRead(buf[SE->_offset],lane); \ chi = coalescedRead(buf[SE->_offset],lane); \
} \ } \
synchronise(); \ acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); Recon(result, Uchi);
@ -74,12 +76,12 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else if ( st.same_node[Dir] ) { \ } else if ( st.same_node[Dir] ) { \
chi = coalescedRead(buf[SE->_offset],lane); \ chi = coalescedRead(buf[SE->_offset],lane); \
} \ } \
synchronise(); \ acceleratorSynchronise(); \
if (SE->_is_local || st.same_node[Dir] ) { \ if (SE->_is_local || st.same_node[Dir] ) { \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \ Recon(result, Uchi); \
} \ } \
synchronise(); acceleratorSynchronise();
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \ #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \ SE = st.GetEntry(ptype, Dir, sF); \
@ -89,7 +91,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
Recon(result, Uchi); \ Recon(result, Uchi); \
nmu++; \ nmu++; \
} \ } \
synchronise(); acceleratorSynchronise();
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \ #define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
if (SE->_is_local ) { \ if (SE->_is_local ) { \
@ -99,7 +101,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \ } else { \
chi = coalescedRead(buf[SE->_offset],lane); \ chi = coalescedRead(buf[SE->_offset],lane); \
} \ } \
synchronise(); \ acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \ Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
Recon(result, Uchi); Recon(result, Uchi);
@ -126,7 +128,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd(); const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd); const int lane=acceleratorSIMTlane(Nsimd);
GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp); GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp); GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp); GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
@ -153,7 +155,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
int ptype; int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd(); const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd); const int lane=acceleratorSIMTlane(Nsimd);
GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp); GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp); GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp); GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
@ -181,7 +183,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd(); const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd); const int lane=acceleratorSIMTlane(Nsimd);
result=Zero(); result=Zero();
GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp); GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
@ -203,7 +205,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; typedef decltype(coalescedRead(buf[0])) calcHalfSpinor;
typedef decltype(coalescedRead(in[0])) calcSpinor; typedef decltype(coalescedRead(in[0])) calcSpinor;
const int Nsimd = SiteHalfSpinor::Nsimd(); const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd); const int lane=acceleratorSIMTlane(Nsimd);
calcHalfSpinor chi; calcHalfSpinor chi;
// calcHalfSpinor *chi_p; // calcHalfSpinor *chi_p;
@ -239,7 +241,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
int ptype; int ptype;
int nmu=0; int nmu=0;
const int Nsimd = SiteHalfSpinor::Nsimd(); const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd); const int lane=acceleratorSIMTlane(Nsimd);
result=Zero(); result=Zero();
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp); GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp); GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
@ -270,7 +272,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
int ptype; int ptype;
int nmu=0; int nmu=0;
const int Nsimd = SiteHalfSpinor::Nsimd(); const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd); const int lane=acceleratorSIMTlane(Nsimd);
result=Zero(); result=Zero();
GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp); GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp); GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
@ -300,7 +302,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
StencilEntry *SE; \ StencilEntry *SE; \
int ptype; \ int ptype; \
const int Nsimd = SiteHalfSpinor::Nsimd(); \ const int Nsimd = SiteHalfSpinor::Nsimd(); \
const int lane=SIMTlane(Nsimd); \ const int lane=acceleratorSIMTlane(Nsimd); \
\ \
SE = st.GetEntry(ptype, dir, sF); \ SE = st.GetEntry(ptype, dir, sF); \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \ GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
@ -328,7 +330,7 @@ void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,Si
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
const int Nsimd = SiteHalfSpinor::Nsimd(); const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=SIMTlane(Nsimd); const int lane=acceleratorSIMTlane(Nsimd);
SE = st.GetEntry(ptype, dir, sF); SE = st.GetEntry(ptype, dir, sF);
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp); GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
@ -346,30 +348,30 @@ template <class Impl>
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls, void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out) int Nsite, const FermionField &in, std::vector<FermionField> &out)
{ {
auto U_v = U.View(); autoView(U_v ,U,AcceleratorRead);
auto in_v = in.View(); autoView(in_v ,in,AcceleratorRead);
auto st_v = st.View(); autoView(st_v ,st,AcceleratorRead);
auto out_Xm = out[0].View(); autoView(out_Xm,out[0],AcceleratorWrite);
auto out_Ym = out[1].View(); autoView(out_Ym,out[1],AcceleratorWrite);
auto out_Zm = out[2].View(); autoView(out_Zm,out[2],AcceleratorWrite);
auto out_Tm = out[3].View(); autoView(out_Tm,out[3],AcceleratorWrite);
auto out_Xp = out[4].View(); autoView(out_Xp,out[4],AcceleratorWrite);
auto out_Yp = out[5].View(); autoView(out_Yp,out[5],AcceleratorWrite);
auto out_Zp = out[6].View(); autoView(out_Zp,out[6],AcceleratorWrite);
auto out_Tp = out[7].View(); autoView(out_Tp,out[7],AcceleratorWrite);
auto CBp=st.CommBuf();
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{ accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
int sU=sss/Ls; int sU=sss/Ls;
int sF =sss; int sF =sss;
DhopDirXm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xm,0); DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
DhopDirYm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Ym,1); DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
DhopDirZm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zm,2); DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
DhopDirTm(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tm,3); DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
DhopDirXp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Xp,4); DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
DhopDirYp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Yp,5); DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
DhopDirZp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Zp,6); DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
DhopDirTp(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_Tp,7); DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
}); });
} }
@ -381,17 +383,18 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
assert(dirdisp<=7); assert(dirdisp<=7);
assert(dirdisp>=0); assert(dirdisp>=0);
auto U_v = U.View(); autoView(U_v ,U ,AcceleratorRead);
auto in_v = in.View(); autoView(in_v ,in ,AcceleratorRead);
auto out_v = out.View(); autoView(out_v,out,AcceleratorWrite);
auto st_v = st.View(); autoView(st_v ,st ,AcceleratorRead);
auto CBp=st.CommBuf();
#define LoopBody(Dir) \ #define LoopBody(Dir) \
case Dir : \ case Dir : \
accelerator_forNB(ss,Nsite,Simd::Nsimd(),{ \ accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
for(int s=0;s<Ls;s++){ \ for(int s=0;s<Ls;s++){ \
int sU=ss; \ int sU=ss; \
int sF = s+Ls*sU; \ int sF = s+Ls*sU; \
DhopDir##Dir(st_v,U_v,st.CommBuf(),sF,sU,in_v,out_v,dirdisp);\ DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
} \ } \
}); \ }); \
break; break;
@ -435,26 +438,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior) int interior,int exterior)
{ {
auto U_v = U.View(); autoView(U_v , U,AcceleratorRead);
auto in_v = in.View(); autoView(in_v , in,AcceleratorRead);
auto out_v = out.View(); autoView(out_v,out,AcceleratorWrite);
auto st_v = st.View(); autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) { if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifndef GRID_NVCC #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif #endif
} else if( interior ) { } else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif #endif
} else if( exterior ) { } else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif #endif
@ -466,26 +469,26 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior,int exterior) int interior,int exterior)
{ {
auto U_v = U.View(); autoView(U_v ,U,AcceleratorRead);
auto in_v = in.View(); autoView(in_v ,in,AcceleratorRead);
auto out_v = out.View(); autoView(out_v,out,AcceleratorWrite);
auto st_v = st.View(); autoView(st_v ,st,AcceleratorRead);
if( interior && exterior ) { if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
#ifndef GRID_NVCC #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif #endif
} else if( interior ) { } else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
#ifndef GRID_NVCC #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif #endif
} else if( exterior ) { } else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
#ifndef GRID_NVCC #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif #endif
@ -493,5 +496,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
assert(0 && " Kernel optimisation case not covered "); assert(0 && " Kernel optimisation case not covered ");
} }
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef ASM_CALL
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -0,0 +1,36 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
NAMESPACE_BEGIN(Grid);
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
NAMESPACE_END(Grid);

View File

@ -0,0 +1,37 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class NaiveStaggeredFermion<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../NaiveStaggeredFermionInstantiation.cc.master

View File

@ -0,0 +1 @@
../NaiveStaggeredFermionInstantiation.cc.master

View File

@ -88,6 +88,7 @@ done
CC_LIST=" \ CC_LIST=" \
ImprovedStaggeredFermion5DInstantiation \ ImprovedStaggeredFermion5DInstantiation \
ImprovedStaggeredFermionInstantiation \ ImprovedStaggeredFermionInstantiation \
NaiveStaggeredFermionInstantiation \
StaggeredKernelsInstantiation " StaggeredKernelsInstantiation "
for impl in $STAG_IMPL_LIST for impl in $STAG_IMPL_LIST

View File

@ -86,9 +86,9 @@ public:
// Move this elsewhere? FIXME // Move this elsewhere? FIXME
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
auto U_v = U.View(); autoView(U_v,U,AcceleratorWrite);
auto W_v = W.View(); autoView(W_v,W,AcceleratorRead);
thread_for( ss, U.Grid()->oSites(), { accelerator_for( ss, U.Grid()->oSites(), 1, {
U_v[ss](mu) = U_v[ss](mu) + W_v[ss](); U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
}); });
} }
@ -131,15 +131,14 @@ public:
//static std::chrono::duration<double> diff; //static std::chrono::duration<double> diff;
//auto start = std::chrono::high_resolution_clock::now(); //auto start = std::chrono::high_resolution_clock::now();
auto U_v = U.View(); autoView(U_v,U,AcceleratorWrite);
auto P_v = P.View(); autoView(P_v,P,AcceleratorRead);
thread_for(ss, P.Grid()->oSites(),{ accelerator_for(ss, P.Grid()->oSites(),1,{
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu)); U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
} }
}); });
//auto end = std::chrono::high_resolution_clock::now();
//auto end = std::chrono::high_resolution_clock::now();
// diff += end - start; // diff += end - start;
// std::cout << "Time to exponentiate matrix " << diff.count() << " s\n"; // std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
} }

View File

@ -89,8 +89,8 @@ public:
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared; action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
auto p_v = p.View(); autoView( p_v , p, CpuRead);
auto action_v = action.View(); autoView( action_v , action, CpuWrite);
for (int mu = 0; mu < Ndim; mu++) for (int mu = 0; mu < Ndim; mu++)
{ {
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils // pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
@ -146,8 +146,8 @@ public:
for (int point = 0; point < npoint; point++) for (int point = 0; point < npoint; point++)
{ {
auto p_v = p.View(); autoView( p_v , p, CpuRead);
auto force_v = force.View(); autoView( force_v , force, CpuWrite);
int permute_type; int permute_type;
StencilEntry *SE; StencilEntry *SE;

View File

@ -80,10 +80,11 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>, static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient"); HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
//static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
// HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB"); static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
//static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>, HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB");
// HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual"); static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
// add the staggered, scalar versions here // add the staggered, scalar versions here

View File

@ -49,7 +49,7 @@ public:
private: private:
const unsigned int smearingLevels; const unsigned int smearingLevels;
Smear_Stout<Gimpl> StoutSmearing; Smear_Stout<Gimpl> *StoutSmearing;
std::vector<GaugeField> SmearedSet; std::vector<GaugeField> SmearedSet;
// Member functions // Member functions
@ -72,7 +72,7 @@ private:
previous_u = *ThinLinks; previous_u = *ThinLinks;
for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
{ {
StoutSmearing.smear(SmearedSet[smearLvl], previous_u); StoutSmearing->smear(SmearedSet[smearLvl], previous_u);
previous_u = SmearedSet[smearLvl]; previous_u = SmearedSet[smearLvl];
// For debug purposes // For debug purposes
@ -93,7 +93,7 @@ private:
GaugeLinkField SigmaKPrime_mu(grid); GaugeLinkField SigmaKPrime_mu(grid);
GaugeLinkField GaugeKmu(grid), Cmu(grid); GaugeLinkField GaugeKmu(grid), Cmu(grid);
StoutSmearing.BaseSmear(C, GaugeK); StoutSmearing->BaseSmear(C, GaugeK);
SigmaK = Zero(); SigmaK = Zero();
iLambda = Zero(); iLambda = Zero();
@ -107,7 +107,7 @@ private:
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
pokeLorentz(iLambda, iLambda_mu, mu); pokeLorentz(iLambda, iLambda_mu, mu);
} }
StoutSmearing.derivative(SigmaK, iLambda, StoutSmearing->derivative(SigmaK, iLambda,
GaugeK); // derivative of SmearBase GaugeK); // derivative of SmearBase
return SigmaK; return SigmaK;
} }
@ -144,14 +144,14 @@ private:
// Exponential // Exponential
iQ2 = iQ * iQ; iQ2 = iQ * iQ;
iQ3 = iQ * iQ2; iQ3 = iQ * iQ2;
StoutSmearing.set_uw(u, w, iQ2, iQ3); StoutSmearing->set_uw(u, w, iQ2, iQ3);
StoutSmearing.set_fj(f0, f1, f2, u, w); StoutSmearing->set_fj(f0, f1, f2, u, w);
e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2; e_iQ = f0 * unity + timesMinusI(f1) * iQ - f2 * iQ2;
// Getting B1, B2, Gamma and Lambda // Getting B1, B2, Gamma and Lambda
// simplify this part, reduntant calculations in set_fj // simplify this part, reduntant calculations in set_fj
xi0 = StoutSmearing.func_xi0(w); xi0 = StoutSmearing->func_xi0(w);
xi1 = StoutSmearing.func_xi1(w); xi1 = StoutSmearing->func_xi1(w);
u2 = u * u; u2 = u * u;
w2 = w * w; w2 = w * w;
cosw = cos(w); cosw = cos(w);
@ -219,7 +219,7 @@ public:
/* Standard constructor */ /* Standard constructor */
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear, SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
Smear_Stout<Gimpl>& Stout) Smear_Stout<Gimpl>& Stout)
: smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) : smearingLevels(Nsmear), StoutSmearing(&Stout), ThinLinks(NULL)
{ {
for (unsigned int i = 0; i < smearingLevels; ++i) for (unsigned int i = 0; i < smearingLevels; ++i)
SmearedSet.push_back(*(new GaugeField(UGrid))); SmearedSet.push_back(*(new GaugeField(UGrid)));
@ -227,7 +227,7 @@ public:
/*! For just thin links */ /*! For just thin links */
SmearedConfiguration() SmearedConfiguration()
: smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} : smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
// attach the smeared routines to the thin links U and fill the smeared set // attach the smeared routines to the thin links U and fill the smeared set
void set_Field(GaugeField &U) void set_Field(GaugeField &U)

View File

@ -185,13 +185,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
for(int i=0;i<Lblock;i++){ for(int i=0;i<Lblock;i++){
auto lhs_v = lhs_wi[i].View(); // Recreate view potentially expensive outside fo UVM mode
autoView(lhs_v,lhs_wi[i],CpuRead);
auto left = conjugate(lhs_v[ss]); auto left = conjugate(lhs_v[ss]);
for(int j=0;j<Rblock;j++){ for(int j=0;j<Rblock;j++){
SpinMatrix_v vv; SpinMatrix_v vv;
auto rhs_v = rhs_vj[j].View(); // Recreate view potentially expensive outside fo UVM mode
autoView(rhs_v,rhs_vj[j],CpuRead);
auto right = rhs_v[ss]; auto right = rhs_v[ss];
for(int s1=0;s1<Ns;s1++){ for(int s1=0;s1<Ns;s1++){
for(int s2=0;s2<Ns;s2++){ for(int s2=0;s2<Ns;s2++){
@ -204,11 +205,10 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
for ( int m=0;m<Nmom;m++){ for ( int m=0;m<Nmom;m++){
int idx = m+base; int idx = m+base;
auto mom_v = mom[m].View(); autoView(mom_v,mom[m],CpuRead);
auto phase = mom_v[ss]; auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase); mac(&lvSum[idx],&vv,&phase);
} }
} }
} }
} }
@ -371,7 +371,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
for(int i=0;i<Lblock;i++){ for(int i=0;i<Lblock;i++){
auto wi_v = wi[i].View(); autoView(wi_v,wi[i],CpuRead);
auto w = conjugate(wi_v[ss]); auto w = conjugate(wi_v[ss]);
if (g5) { if (g5) {
w()(2)(0) = - w()(2)(0); w()(2)(0) = - w()(2)(0);
@ -383,7 +383,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
} }
for(int j=0;j<Rblock;j++){ for(int j=0;j<Rblock;j++){
auto vj_v=vj[j].View(); autoView(vj_v,vj[j],CpuRead);
auto v = vj_v[ss]; auto v = vj_v[ss];
auto vv = v()(0)(0); auto vv = v()(0)(0);
@ -518,12 +518,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
for(int i=0;i<Lblock;i++){ for(int i=0;i<Lblock;i++){
auto wi_v = wi[i].View(); autoView(wi_v,wi[i],CpuRead);
auto w = conjugate(wi_v[ss]); auto w = conjugate(wi_v[ss]);
for(int j=0;j<Rblock;j++){ for(int j=0;j<Rblock;j++){
auto vj_v = vj[j].View(); autoView(vj_v,vj[j],CpuRead);
auto v = vj_v[ss]; auto v = vj_v[ss];
auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
@ -544,7 +544,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r; int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
for ( int m=0;m<Nmom;m++){ for ( int m=0;m<Nmom;m++){
int idx = m+base; int idx = m+base;
auto mom_v = mom[m].View(); autoView(mom_v,mom[m],CpuRead);
auto phase = mom_v[ss]; auto phase = mom_v[ss];
mac(&lvSum[idx],&vv,&phase()()()); mac(&lvSum[idx],&vv,&phase()()());
} }
@ -730,13 +730,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
for(int i=0;i<Lblock;i++) for(int i=0;i<Lblock;i++)
{ {
auto wi_v = lhs_wi[i].View(); autoView(wi_v,lhs_wi[i],CpuRead);
auto left = conjugate(wi_v[ss]); auto left = conjugate(wi_v[ss]);
for(int j=0;j<Rblock;j++) for(int j=0;j<Rblock;j++)
{ {
SpinMatrix_v vv; SpinMatrix_v vv;
auto vj_v = rhs_vj[j].View(); autoView(vj_v,rhs_vj[j],CpuRead);
auto right = vj_v[ss]; auto right = vj_v[ss];
for(int s1=0;s1<Ns;s1++) for(int s1=0;s1<Ns;s1++)
@ -752,8 +752,8 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
for ( int m=0;m<Nem;m++) for ( int m=0;m<Nem;m++)
{ {
auto emB0_v = emB0[m].View(); autoView(emB0_v,emB0[m],CpuRead);
auto emB1_v = emB1[m].View(); autoView(emB1_v,emB1[m],CpuRead);
int idx = m+base; int idx = m+base;
auto b0 = emB0_v[ss]; auto b0 = emB0_v[ss];
auto b1 = emB1_v[ss]; auto b1 = emB1_v[ss];
@ -1014,21 +1014,21 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
for(int d_o=0;d_o<N_d;d_o+=d_unroll){ for(int d_o=0;d_o<N_d;d_o+=d_unroll){
for(int t=0;t<N_t;t++){ for(int t=0;t<N_t;t++){
for(int s=0;s<N_s;s++){ for(int s=0;s<N_s;s++){
auto vs_v = vs[s].View(); autoView(vs_v,vs[s],CpuRead);
auto tmp1 = vs_v[ss]; auto tmp1 = vs_v[ss];
vobj tmp2 = Zero(); vobj tmp2 = Zero();
vobj tmp3 = Zero(); vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){ for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
auto vd_v = vd[d].View(); autoView(vd_v,vd[d],CpuRead);
Scalar_v coeff = WW_sd(t,s,d); Scalar_v coeff = WW_sd(t,s,d);
tmp3 = conjugate(vd_v[ss]); tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3); mac(&tmp2, &coeff, &tmp3);
} }
////////////////////////// //////////////////////////
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
////////////////////////// //////////////////////////
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss); OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
}} }}
} }
@ -1067,21 +1067,20 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{
for(int d_o=0;d_o<N_d;d_o+=d_unroll){ for(int d_o=0;d_o<N_d;d_o+=d_unroll){
for(int s=0;s<N_s;s++){ for(int s=0;s<N_s;s++){
auto vs_v = vs[s].View(); autoView(vs_v,vs[s],CpuRead);
auto tmp1 = vs_v[ss]; auto tmp1 = vs_v[ss];
vobj tmp2 = Zero(); vobj tmp2 = Zero();
vobj tmp3 = Zero(); vobj tmp3 = Zero();
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){ for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
auto vd_v = vd[d].View(); autoView(vd_v,vd[d],CpuRead);
Scalar_v coeff = buf(s,d); Scalar_v coeff = buf(s,d);
tmp3 = conjugate(vd_v[ss]); tmp3 = conjugate(vd_v[ss]);
mac(&tmp2, &coeff, &tmp3); mac(&tmp2, &coeff, &tmp3);
} }
//////////////////////////
////////////////////////// // Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll //////////////////////////
////////////////////////// OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
}} }}
}); });
} }
@ -1093,7 +1092,7 @@ inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
const vobj &rhs, const vobj &rhs,
const int Ns, const int ss) const int Ns, const int ss)
{ {
auto WWVV_v = WWVV.View(); autoView(WWVV_v,WWVV,CpuWrite);
for (int s1 = 0; s1 < Ns; s1++){ for (int s1 = 0; s1 < Ns; s1++){
for (int s2 = 0; s2 < Ns; s2++){ for (int s2 = 0; s2 < Ns; s2++){
WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0); WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
@ -1122,10 +1121,10 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
GridBase *grid = WWVV0.Grid(); GridBase *grid = WWVV0.Grid();
auto WWVV0_v = WWVV0.View(); autoView(WWVV0_v , WWVV0,CpuRead);
auto WWVV1_v = WWVV1.View(); autoView(WWVV1_v , WWVV1,CpuRead);
auto O_trtr_v= O_trtr.View(); autoView(O_trtr_v, O_trtr,CpuWrite);
auto O_fig8_v= O_fig8.View(); autoView(O_fig8_v, O_fig8,CpuWrite);
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{
typedef typename ComplexField::vector_object vobj; typedef typename ComplexField::vector_object vobj;
@ -1166,10 +1165,10 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
GridBase *grid = WWVV0.Grid(); GridBase *grid = WWVV0.Grid();
auto WWVV0_v = WWVV0.View(); autoView( WWVV0_v , WWVV0,CpuRead);
auto WWVV1_v = WWVV1.View(); autoView( WWVV1_v , WWVV1,CpuRead);
auto O_trtr_v= O_trtr.View(); autoView( O_trtr_v, O_trtr,CpuWrite);
auto O_fig8_v= O_fig8.View(); autoView( O_fig8_v, O_fig8,CpuWrite);
thread_for(ss,grid->oSites(),{ thread_for(ss,grid->oSites(),{

View File

@ -351,10 +351,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
GridBase *grid = q1_left.Grid(); GridBase *grid = q1_left.Grid();
auto vbaryon_corr= baryon_corr.View(); autoView(vbaryon_corr, baryon_corr,CpuWrite);
auto v1 = q1_left.View(); autoView( v1 , q1_left, CpuRead);
auto v2 = q2_left.View(); autoView( v2 , q2_left, CpuRead);
auto v3 = q3_left.View(); autoView( v3 , q3_left, CpuRead);
Real bytes =0.; Real bytes =0.;
bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real));
@ -989,10 +989,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
GridBase *grid = qs_ti.Grid(); GridBase *grid = qs_ti.Grid();
auto vcorr= stn_corr.View(); autoView( vcorr, stn_corr, CpuWrite);
auto vq_loop = qq_loop.View(); autoView( vq_loop , qq_loop, CpuRead);
auto vd_tf = qd_tf.View(); autoView( vd_tf , qd_tf, CpuRead);
auto vs_ti = qs_ti.View(); autoView( vs_ti , qs_ti, CpuRead);
accelerator_for(ss, grid->oSites(), grid->Nsimd(), { accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
auto Dq_loop = vq_loop[ss]; auto Dq_loop = vq_loop[ss];
@ -1029,13 +1029,13 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
GridBase *grid = qs_ti.Grid(); GridBase *grid = qs_ti.Grid();
auto vcorr= stn_corr.View(); autoView( vcorr , stn_corr, CpuWrite);
auto vq_ti = qq_ti.View(); autoView( vq_ti , qq_ti, CpuRead);
auto vq_tf = qq_tf.View(); autoView( vq_tf , qq_tf, CpuRead);
auto vd_tf = qd_tf.View(); autoView( vd_tf , qd_tf, CpuRead);
auto vs_ti = qs_ti.View(); autoView( vs_ti , qs_ti, CpuRead);
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
accelerator_for(ss, grid->oSites(), grid->Nsimd(), { thread_for(ss,grid->oSites(),{
auto Dq_ti = vq_ti[ss]; auto Dq_ti = vq_ti[ss];
auto Dq_tf = vq_tf[ss]; auto Dq_tf = vq_tf[ss];
auto Dd_tf = vd_tf[ss]; auto Dd_tf = vd_tf[ss];

View File

@ -47,8 +47,8 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View(); autoView(x_v, x, AcceleratorRead);
auto z_v = z.View(); autoView(z_v, z, AcceleratorWrite);
accelerator_for( ss, x_v.size(),vobj::Nsimd(), { accelerator_for( ss, x_v.size(),vobj::Nsimd(), {
auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss))); auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss)));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
@ -63,9 +63,9 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
conformable(x,z); conformable(x,z);
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); autoView( x_v, x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v, y, AcceleratorRead);
auto z_v = z.View(); autoView( z_v, z, AcceleratorWrite);
// FIXME -- need a new class of accelerator_loop to implement this // FIXME -- need a new class of accelerator_loop to implement this
// //
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View(); autoView( x_v, x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v, y, AcceleratorRead);
auto z_v = z.View(); autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
conformable(x,z); conformable(x,z);
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); autoView( x_v, x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v, y, AcceleratorRead);
auto z_v = z.View(); autoView( z_v, z, AcceleratorWrite);
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); autoView( x_v, x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v, y, AcceleratorRead);
auto z_v = z.View(); autoView( z_v, z, AcceleratorWrite);
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); autoView( x_v, x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v, y, AcceleratorRead);
auto z_v = z.View(); autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
conformable(x,z); conformable(x,z);
GridBase *grid=x.Grid(); GridBase *grid=x.Grid();
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
auto x_v = x.View(); autoView( x_v, x, AcceleratorRead);
auto y_v = y.View(); autoView( y_v, y, AcceleratorRead);
auto z_v = z.View(); autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
@ -189,8 +189,8 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
conformable(x,z); conformable(x,z);
int Ls = grid->_rdimensions[0]; int Ls = grid->_rdimensions[0];
Gamma G5(Gamma::Algebra::Gamma5); Gamma G5(Gamma::Algebra::Gamma5);
auto x_v = x.View(); autoView( x_v, x, AcceleratorRead);
auto z_v = z.View(); autoView( z_v, z, AcceleratorWrite);
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,vobj::Nsimd(),{ accelerator_for(sss,nloop,vobj::Nsimd(),{
uint64_t ss = sss*Ls; uint64_t ss = sss*Ls;
@ -222,8 +222,8 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
static_assert(nbasis % 2 == 0, ""); static_assert(nbasis % 2 == 0, "");
int nb = nbasis / 2; int nb = nbasis / 2;
auto z_v = z.View(); autoView( z_v, z, AcceleratorWrite);
auto x_v = x.View(); autoView( x_v, x, AcceleratorRead);
accelerator_for(ss,grid->oSites(),CComplex::Nsimd(), accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
{ {
for(int n = 0; n < nb; ++n) { for(int n = 0; n < nb; ++n) {

View File

@ -222,11 +222,11 @@ public:
conformable(subgroup, Determinant); conformable(subgroup, Determinant);
int i0, i1; int i0, i1;
su2SubGroupIndex(i0, i1, su2_index); su2SubGroupIndex(i0, i1, su2_index);
auto subgroup_v = subgroup.View();
auto source_v = source.View();
auto Determinant_v = Determinant.View();
thread_for(ss, grid->oSites(), { autoView( subgroup_v , subgroup,AcceleratorWrite);
autoView( source_v , source,AcceleratorRead);
autoView( Determinant_v , Determinant,AcceleratorWrite);
accelerator_for(ss, grid->oSites(), 1, {
subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0); subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1); subgroup_v[ss]()()(0, 1) = source_v[ss]()()(i0, i1);
@ -257,15 +257,16 @@ public:
su2SubGroupIndex(i0, i1, su2_index); su2SubGroupIndex(i0, i1, su2_index);
dest = 1.0; // start out with identity dest = 1.0; // start out with identity
auto dest_v = dest.View(); autoView( dest_v , dest, AcceleratorWrite);
auto subgroup_v = subgroup.View(); autoView( subgroup_v, subgroup, AcceleratorRead);
thread_for(ss, grid->oSites(), accelerator_for(ss, grid->oSites(),1,
{ {
dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0); dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1); dest_v[ss]()()(i0, i1) = subgroup_v[ss]()()(0, 1);
dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0); dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1); dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
}); });
} }
/////////////////////////////////////////////// ///////////////////////////////////////////////
@ -608,8 +609,8 @@ public:
// reunitarise?? // reunitarise??
template <typename LatticeMatrixType> template <typename LatticeMatrixType>
static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0)
double scale = 1.0) { {
GridBase *grid = out.Grid(); GridBase *grid = out.Grid();
typedef typename LatticeMatrixType::vector_type vector_type; typedef typename LatticeMatrixType::vector_type vector_type;
@ -618,8 +619,7 @@ public:
typedef iSinglet<vector_type> vTComplexType; typedef iSinglet<vector_type> vTComplexType;
typedef Lattice<vTComplexType> LatticeComplexType; typedef Lattice<vTComplexType> LatticeComplexType;
typedef typename GridTypeMapper< typedef typename GridTypeMapper<typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
LatticeComplexType ca(grid); LatticeComplexType ca(grid);
LatticeMatrixType lie(grid); LatticeMatrixType lie(grid);
@ -629,6 +629,7 @@ public:
MatrixType ta; MatrixType ta;
lie = Zero(); lie = Zero();
for (int a = 0; a < AdjointDimension; a++) { for (int a = 0; a < AdjointDimension; a++) {
random(pRNG, ca); random(pRNG, ca);
@ -640,6 +641,7 @@ public:
la = ci * ca * ta; la = ci * ca * ta;
lie = lie + la; // e^{i la ta} lie = lie + la; // e^{i la ta}
} }
taExp(lie, out); taExp(lie, out);
} }

View File

@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "BinaryIO.h" #include "BinaryIO.h"
#include "TextIO.h" #include "TextIO.h"
#include "XmlIO.h" #include "XmlIO.h"
#ifndef GRID_NVCC #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
#include "JSON_IO.h" #include "JSON_IO.h"
#endif #endif

View File

@ -32,7 +32,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*/ */
//---------------------------------------------------------------------- //----------------------------------------------------------------------
#ifdef GRID_CUDA
#include <cuda_fp16.h> #include <cuda_fp16.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_fp16.h>
#endif
namespace Grid { namespace Grid {
@ -142,7 +147,7 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
accelerator_inline float half2float(half h) accelerator_inline float half2float(half h)
{ {
float f; float f;
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
f = __half2float(h); f = __half2float(h);
#else #else
//f = __half2float(h); //f = __half2float(h);
@ -156,7 +161,7 @@ accelerator_inline float half2float(half h)
accelerator_inline half float2half(float f) accelerator_inline half float2half(float f)
{ {
half h; half h;
#ifdef __CUDA_ARCH__ #ifdef GRID_SIMT
h = __float2half(f); h = __float2half(f);
#else #else
Grid_half hh = sfw_float_to_half(f); Grid_half hh = sfw_float_to_half(f);

View File

@ -31,7 +31,7 @@ directory
#ifndef GRID_SIMD_H #ifndef GRID_SIMD_H
#define GRID_SIMD_H #define GRID_SIMD_H
#ifdef GRID_NVCC #if defined(GRID_CUDA) || defined(GRID_HIP)
#include <thrust/complex.h> #include <thrust/complex.h>
#endif #endif
@ -65,7 +65,7 @@ typedef RealD Real;
typedef RealF Real; typedef RealF Real;
#endif #endif
#ifdef GRID_NVCC #if defined(GRID_CUDA) || defined(GRID_HIP)
typedef thrust::complex<RealF> ComplexF; typedef thrust::complex<RealF> ComplexF;
typedef thrust::complex<RealD> ComplexD; typedef thrust::complex<RealD> ComplexD;
typedef thrust::complex<Real> Complex; typedef thrust::complex<Real> Complex;

View File

@ -67,7 +67,8 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
{ {
int num=table.size(); int num=table.size();
std::pair<int,int> *table_v = & table[0]; std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View();
auto rhs_v = rhs.View(AcceleratorRead);
accelerator_forNB( i,num, vobj::Nsimd(), { accelerator_forNB( i,num, vobj::Nsimd(), {
typedef decltype(coalescedRead(buffer[0])) compressed_t; typedef decltype(coalescedRead(buffer[0])) compressed_t;
compressed_t tmp_c; compressed_t tmp_c;
@ -75,6 +76,7 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
coalescedWrite(buffer[off+o],tmp_c); coalescedWrite(buffer[off+o],tmp_c);
}); });
rhs_v.ViewClose();
// Further optimisatoin: i) software prefetch the first element of the next table entry, prefetch the table // Further optimisatoin: i) software prefetch the first element of the next table entry, prefetch the table
} }
@ -94,7 +96,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
int num=table.size()/2; int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(); auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0]; auto p0=&pointers[0][0];
auto p1=&pointers[1][0]; auto p1=&pointers[1][0];
auto tp=&table[0]; auto tp=&table[0];
@ -104,10 +106,11 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
so+tp[2*j+1].second, so+tp[2*j+1].second,
type); type);
}); });
rhs_v.ViewClose();
} }
struct StencilEntry { struct StencilEntry {
#ifdef GRID_NVCC #ifdef GRID_CUDA
uint64_t _byte_offset; // 8 bytes uint64_t _byte_offset; // 8 bytes
uint32_t _offset; // 4 bytes uint32_t _offset; // 4 bytes
#else #else
@ -122,7 +125,7 @@ struct StencilEntry {
// Could pack to 8 + 4 + 4 = 128 bit and use // Could pack to 8 + 4 + 4 = 128 bit and use
template<class vobj,class cobj,class Parameters> template<class vobj,class cobj,class Parameters>
class CartesianStencilView { class CartesianStencilAccelerator {
public: public:
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector; typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
@ -130,14 +133,15 @@ class CartesianStencilView {
//////////////////////////////////////// ////////////////////////////////////////
// Basic Grid and stencil info // Basic Grid and stencil info
//////////////////////////////////////// ////////////////////////////////////////
int _checkerboard; int _checkerboard;
int _npoints; // Move to template param? int _npoints; // Move to template param?
int _osites;
StencilVector _directions; StencilVector _directions;
StencilVector _distances; StencilVector _distances;
StencilVector _comm_buf_size; StencilVector _comm_buf_size;
StencilVector _permute_type; StencilVector _permute_type;
StencilVector same_node; StencilVector same_node;
Coordinate _simd_layout; Coordinate _simd_layout;
Parameters parameters; Parameters parameters;
StencilEntry* _entries_p; StencilEntry* _entries_p;
cobj* u_recv_buf_p; cobj* u_recv_buf_p;
@ -175,13 +179,43 @@ class CartesianStencilView {
{ {
Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout);
} }
};
template<class vobj,class cobj,class Parameters>
class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters>
{
private:
int *closed;
StencilEntry *cpu_ptr;
ViewMode mode;
public:
// default copy constructor
CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
cpu_ptr(this->_entries_p),
mode(_mode)
{
this->_entries_p =(StencilEntry *)
MemoryManager::ViewOpen(this->_entries_p,
this->_npoints*this->_osites*sizeof(StencilEntry),
mode,
AdviseDefault);
}
void ViewClose(void)
{
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
}; };
//////////////////////////////////////// ////////////////////////////////////////
// The Stencil Class itself // The Stencil Class itself
//////////////////////////////////////// ////////////////////////////////////////
template<class vobj,class cobj,class Parameters> template<class vobj,class cobj,class Parameters>
class CartesianStencil : public CartesianStencilView<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in. class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters> { // Stencil runs along coordinate axes only; NO diagonal fill in.
public: public:
typedef typename cobj::vector_type vector_type; typedef typename cobj::vector_type vector_type;
@ -226,8 +260,8 @@ public:
// Generalise as required later if needed // Generalise as required later if needed
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
View_type View(void) const { View_type View(ViewMode mode) const {
View_type accessor(*( (View_type *) this)); View_type accessor(*( (View_type *) this),mode);
return accessor; return accessor;
} }
@ -662,9 +696,9 @@ public:
_unified_buffer_size=0; _unified_buffer_size=0;
surface_list.resize(0); surface_list.resize(0);
int osites = _grid->oSites(); this->_osites = _grid->oSites();
_entries.resize(this->_npoints* osites); _entries.resize(this->_npoints* this->_osites);
this->_entries_p = &_entries[0]; this->_entries_p = &_entries[0];
for(int ii=0;ii<npoints;ii++){ for(int ii=0;ii<npoints;ii++){

View File

@ -31,22 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
//accelerator_inline void SIMTsynchronise(void)
accelerator_inline void synchronise(void)
{
#ifdef __CUDA_ARCH__
// __syncthreads();
__syncwarp();
#endif
return;
}
#ifndef __CUDA_ARCH__ #ifndef GRID_SIMT
////////////////////////////////////////// //////////////////////////////////////////
// Trivial mapping of vectors on host // Trivial mapping of vectors on host
////////////////////////////////////////// //////////////////////////////////////////
accelerator_inline int SIMTlane(int Nsimd) { return 0; } // CUDA specific
template<class vobj> accelerator_inline template<class vobj> accelerator_inline
vobj coalescedRead(const vobj & __restrict__ vec,int lane=0) vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
{ {
@ -66,7 +55,6 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
template<class vobj> accelerator_inline template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0) void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
{ {
// vstream(vec, extracted);
vec = extracted; vec = extracted;
} }
template<class vobj> accelerator_inline template<class vobj> accelerator_inline
@ -75,25 +63,24 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
vstream(vec, extracted); vstream(vec, extracted);
} }
#else #else
accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
////////////////////////////////////////// //////////////////////////////////////////
// Extract and insert slices on the GPU // Extract and insert slices on the GPU
////////////////////////////////////////// //////////////////////////////////////////
template<class vobj> accelerator_inline template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=SIMTlane(vobj::Nsimd())) typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{ {
return extractLane(lane,vec); return extractLane(lane,vec);
} }
template<class vobj> accelerator_inline template<class vobj> accelerator_inline
typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=SIMTlane(vobj::Nsimd())) typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{ {
int mask = vobj::Nsimd() >> (ptype + 1); int mask = vobj::Nsimd() >> (ptype + 1);
int plane= doperm ? lane ^ mask : lane; int plane= doperm ? lane ^ mask : lane;
return extractLane(plane,vec); return extractLane(plane,vec);
} }
template<class vobj> accelerator_inline template<class vobj> accelerator_inline
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=SIMTlane(vobj::Nsimd())) void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
{ {
insertLane(lane,vec,extracted); insertLane(lane,vec,extracted);
} }

View File

@ -59,6 +59,20 @@ class GridTensorBase {};
using DoublePrecision2= typename Traits::DoublePrecision2; \ using DoublePrecision2= typename Traits::DoublePrecision2; \
static constexpr int TensorLevel = Traits::TensorLevel static constexpr int TensorLevel = Traits::TensorLevel
///////////////////////////////////////////////////////////
// Allows to turn scalar<scalar<scalar<double>>>> back to double.
///////////////////////////////////////////////////////////
template <class T>
accelerator_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
TensorRemove(T arg) {
return arg;
}
template <class vtype>
accelerator_inline auto TensorRemove(iScalar<vtype> arg)
-> decltype(TensorRemove(arg._internal)) {
return TensorRemove(arg._internal);
}
template <class vtype> template <class vtype>
class iScalar { class iScalar {
public: public:
@ -135,9 +149,10 @@ public:
operator ComplexD() const { operator ComplexD() const {
return (TensorRemove(_internal)); return (TensorRemove(_internal));
} }
// instantiation of "Grid::iScalar<vtype>::operator Grid::RealD() const [with vtype=Grid::Real, U=Grid::Real, V=Grid::RealD, <unnamed>=0, <unnamed>=0U]"
template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0> accelerator_inline template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0> accelerator_inline
operator RealD() const { operator RealD() const {
return TensorRemove(_internal); return (RealD) TensorRemove(_internal);
} }
template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0> accelerator_inline template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0> accelerator_inline
operator Integer() const { operator Integer() const {
@ -169,20 +184,6 @@ public:
strong_inline scalar_type * end() { return begin() + Traits::count; } strong_inline scalar_type * end() { return begin() + Traits::count; }
}; };
///////////////////////////////////////////////////////////
// Allows to turn scalar<scalar<scalar<double>>>> back to double.
///////////////////////////////////////////////////////////
template <class T>
accelerator_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
TensorRemove(T arg) {
return arg;
}
template <class vtype>
accelerator_inline auto TensorRemove(iScalar<vtype> arg)
-> decltype(TensorRemove(arg._internal)) {
return TensorRemove(arg._internal);
}
template <class vtype, int N> template <class vtype, int N>
class iVector { class iVector {
public: public:

View File

@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
// Specialisation: Cayley-Hamilton exponential for SU(3) // Specialisation: Cayley-Hamilton exponential for SU(3)
#ifndef GRID_NVCC #ifndef GRID_CUDA
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP ) accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
{ {

207
Grid/threads/Accelerator.cc Normal file
View File

@ -0,0 +1,207 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
uint32_t accelerator_threads=2;
uint32_t acceleratorThreads(void) {return accelerator_threads;};
void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#ifdef GRID_CUDA
cudaDeviceProp *gpu_props;
void acceleratorInit(void)
{
int nDevices = 1;
cudaGetDeviceCount(&nDevices);
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
size_t totalDeviceMem=0;
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
cudaDeviceProp prop;
prop = gpu_props[i];
totalDeviceMem = prop.totalGlobalMem;
if ( world_rank == 0) {
printf("AcceleratorCudaInit: ========================\n");
printf("AcceleratorCudaInit: Device Number : %d\n", i);
printf("AcceleratorCudaInit: ========================\n");
printf("AcceleratorCudaInit: Device identifier: %s\n", prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("AcceleratorCudaInit: setting device to node rank\n");
cudaSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
}
#endif
#ifdef GRID_HIP
hipDeviceProp_t *gpu_props;
void acceleratorInit(void)
{
int nDevices = 1;
hipGetDeviceCount(&nDevices);
gpu_props = new hipDeviceProp_t[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
hipGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
hipDeviceProp_t prop;
prop = gpu_props[i];
printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device Number : %d\n", i);
printf("AcceleratorHipInit: ========================\n");
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
// GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
#undef GPU_PROP_FMT
#undef GPU_PROP
#ifdef GRID_IBM_SUMMIT
// IBM Jsrun makes cuda Device numbering screwy and not match rank
if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n");
#else
if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n");
hipSetDevice(rank);
#endif
if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");
}
#endif
#ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
auto devices = cl::sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){
#define GPU_PROP_STR(prop) \
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
#define GPU_PROP_FMT(prop,FMT) \
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
GPU_PROP_STR(vendor);
GPU_PROP_STR(version);
// GPU_PROP_STR(device_type);
/*
GPU_PROP(max_compute_units);
GPU_PROP(native_vector_width_char);
GPU_PROP(native_vector_width_short);
GPU_PROP(native_vector_width_int);
GPU_PROP(native_vector_width_long);
GPU_PROP(native_vector_width_float);
GPU_PROP(native_vector_width_double);
GPU_PROP(native_vector_width_half);
GPU_PROP(address_bits);
GPU_PROP(half_fp_config);
GPU_PROP(single_fp_config);
*/
// GPU_PROP(double_fp_config);
GPU_PROP(global_mem_size);
}
if ( world_rank == 0 ) {
auto name = theGridAccelerator->get_device().get_info<sycl::info::device::name>();
printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str());
printf("AcceleratorSyclInit: ================================================\n");
}
}
#endif
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
void acceleratorInit(void){}
#endif
NAMESPACE_END(Grid);

426
Grid/threads/Accelerator.h Normal file
View File

@ -0,0 +1,426 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Accelerator.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <string.h>
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////
// Accelerator primitives; fall back to threading if not CUDA or SYCL
//////////////////////////////////////////////////////////////////////////////////
//
// Function attributes
//
// accelerator
// accelerator_inline
//
// Parallel looping
//
// accelerator_for
// accelerator_forNB
// uint32_t accelerator_barrier(); // device synchronise
//
// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
//
// uint32_t acceleratorThreads(void);
// void acceleratorThreads(uint32_t);
//
// Warp control and info:
//
// acceleratorInit;
// void acceleratorSynchronise(void); // synch warp etc..
// int acceleratorSIMTlane(int Nsimd);
//
// Memory management:
//
// void *acceleratorAllocShared(size_t bytes);
// void acceleratorFreeShared(void *ptr);
//
// void *acceleratorAllocDevice(size_t bytes);
// void acceleratorFreeDevice(void *ptr);
//
// void *acceleratorCopyToDevice(void *from,void *to,size_t bytes);
// void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes);
//
//////////////////////////////////////////////////////////////////////////////////
uint32_t acceleratorThreads(void);
void acceleratorThreads(uint32_t);
void acceleratorInit(void);
//////////////////////////////////////////////
// CUDA acceleration
//////////////////////////////////////////////
#ifdef GRID_CUDA
#ifdef __CUDA_ARCH__
#define GRID_SIMT
#endif
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return threadIdx.z;
#else
return 0;
#endif
} // CUDA specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
int nt=acceleratorThreads(); \
dim3 cu_threads(acceleratorThreads(),1,nsimd); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads>>>(num1,num2,nsimd,lambda); \
}
template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
uint64_t x = threadIdx.x + blockDim.x*blockIdx.x;
uint64_t y = threadIdx.y + blockDim.y*blockIdx.y;
uint64_t z = threadIdx.z;
if ( (x < num1) && (y<num2) && (z<num3) ) {
Lambda(x,y,z);
}
}
#define accelerator_barrier(dummy) \
{ \
cudaDeviceSynchronize(); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("Cuda error %s \n", cudaGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
} \
}
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
#endif
//////////////////////////////////////////////
// SyCL acceleration
//////////////////////////////////////////////
#ifdef GRID_SYCL
NAMESPACE_END(Grid);
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
NAMESPACE_BEGIN(Grid);
extern cl::sycl::queue *theGridAccelerator;
#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
#endif
#define accelerator
#define accelerator_inline strong_inline
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
#else
return 0;
#endif
} // SYCL specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
unsigned long nt=acceleratorThreads(); \
unsigned long unum1 = num1; \
unsigned long unum2 = num2; \
cl::sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
cgh.parallel_for<class dslash>( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) mutable { \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
auto lane = item.get_global_id(2); \
{ __VA_ARGS__ }; \
}); \
});
#define accelerator_barrier(dummy) theGridAccelerator->wait();
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
#endif
//////////////////////////////////////////////
// HIP acceleration
//////////////////////////////////////////////
#ifdef GRID_HIP
NAMESPACE_END(Grid);
#include <hip/hip_runtime.h>
NAMESPACE_BEGIN(Grid);
#ifdef __HIP_DEVICE_COMPILE__
#define GRID_SIMT
#endif
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
/*These routines define mapping from thread grid to loop & vector lane indexing */
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return hipThreadIdx_z;
#else
return 0;
#endif
} // HIP specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane ) mutable { \
{ __VA_ARGS__;} \
}; \
int nt=acceleratorThreads(); \
dim3 hip_threads(nt,1,nsimd); \
dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
0,0, \
num1,num2,nsimd,lambda); \
}
template<typename lambda> __global__
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
uint64_t x = hipThreadIdx_x + hipBlockDim_x*hipBlockIdx_x;
uint64_t y = hipThreadIdx_y + hipBlockDim_y*hipBlockIdx_y;
uint64_t z = hipThreadIdx_z ;//+ hipBlockDim_z*hipBlockIdx_z;
if ( (x < numx) && (y<numy) && (z<numz) ) {
Lambda(x,y,z);
}
}
#define accelerator_barrier(dummy) \
{ \
hipDeviceSynchronize(); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
inline void *acceleratorAllocShared(size_t bytes)
{
#if 0
void *ptr=NULL;
auto err = hipMallocManaged((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
}
return ptr;
#else
return malloc(bytes);
#endif
};
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ free(ptr);};
inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
#endif
//////////////////////////////////////////////
// Common on all GPU targets
//////////////////////////////////////////////
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define accelerator_for( iter, num, nsimd, ... ) \
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#endif
//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
//////////////////////////////////////////////
#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#undef GRID_SIMT
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
#else
inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif // CPU target
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocCpu(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeCpu (void *ptr){_mm_free(ptr);};
#else
inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeCpu (void *ptr){free(ptr);};
#endif
///////////////////////////////////////////////////
// Synchronise across local threads for divergence resynch
///////////////////////////////////////////////////
accelerator_inline void acceleratorSynchronise(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__syncwarp();
#endif
#ifdef GRID_SYCL
// No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier
#endif
#ifdef GRID_HIP
__syncthreads();
#endif
#endif
return;
}
accelerator_inline void acceleratorSynchroniseAll(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__syncthreads();
#endif
#ifdef GRID_SYCL
// No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier
#endif
#ifdef GRID_HIP
__syncthreads();
#endif
#endif
return;
}
accelerator_inline void acceleratorFence(void)
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
__threadfence();
#endif
#ifdef GRID_SYCL
// FIXMEE
#endif
#ifdef GRID_HIP
__threadfence();
#endif
#endif
return;
}
NAMESPACE_END(Grid);

View File

@ -2,7 +2,7 @@
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Threads.h Source file: ./lib/Pragmas.h
Copyright (C) 2015 Copyright (C) 2015
@ -28,107 +28,5 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#ifndef MAX #include <Grid/threads/Threads.h>
#define MAX(x,y) ((x)>(y)?(x):(y)) #include <Grid/threads/Accelerator.h>
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
#define strong_inline __attribute__((always_inline)) inline
#define UNROLL _Pragma("unroll")
//////////////////////////////////////////////////////////////////////////////////
// New primitives; explicit host thread calls, and accelerator data parallel calls
//////////////////////////////////////////////////////////////////////////////////
#ifdef _OPENMP
#define GRID_OMP
#include <omp.h>
#endif
#ifdef GRID_OMP
#define DO_PRAGMA_(x) _Pragma (#x)
#define DO_PRAGMA(x) DO_PRAGMA_(x)
#define thread_num(a) omp_get_thread_num()
#define thread_max(a) omp_get_max_threads()
#else
#define DO_PRAGMA_(x)
#define DO_PRAGMA(x)
#define thread_num(a) (0)
#define thread_max(a) (1)
#endif
#define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
#define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse( N , i, num, ... ) DO_PRAGMA(omp parallel for collapse ( N ) ) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_for_collapse_in_region( N , i, num, ... ) DO_PRAGMA(omp for collapse ( N )) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define thread_region DO_PRAGMA(omp parallel)
#define thread_critical DO_PRAGMA(omp critical)
//////////////////////////////////////////////////////////////////////////////////
// Accelerator primitives; fall back to threading
//////////////////////////////////////////////////////////////////////////////////
#ifdef __NVCC__
#define GRID_NVCC
#endif
#ifdef GRID_NVCC
extern uint32_t gpu_threads;
#define accelerator __host__ __device__
#define accelerator_inline __host__ __device__ inline
template<typename lambda> __global__
void LambdaApplySIMT(uint64_t Isites, uint64_t Osites, lambda Lambda)
{
uint64_t isite = threadIdx.y;
uint64_t osite = threadIdx.x+blockDim.x*blockIdx.x;
if ( (osite <Osites) && (isite<Isites) ) {
Lambda(isite,osite);
}
}
/////////////////////////////////////////////////////////////////
// Internal only really... but need to call when
/////////////////////////////////////////////////////////////////
#define accelerator_barrier(dummy) \
{ \
cudaDeviceSynchronize(); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("Cuda error %s \n", cudaGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
// Copy the for_each_n style ; Non-blocking variant
#define accelerator_forNB( iterator, num, nsimd, ... ) \
{ \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator (Iterator lane,Iterator iterator) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(gpu_threads,nsimd); \
dim3 cu_blocks ((num+gpu_threads-1)/gpu_threads); \
LambdaApplySIMT<<<cu_blocks,cu_threads>>>(nsimd,num,lambda); \
}
// Copy the for_each_n style ; Non-blocking variant (default
#define accelerator_for( iterator, num, nsimd, ... ) \
accelerator_forNB(iterator, num, nsimd, { __VA_ARGS__ } ); \
accelerator_barrier(dummy);
#else
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#endif

View File

@ -0,0 +1,127 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/ThreadReduction.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
// Introduce a class to gain deterministic bit reproducible reduction.
// make static; perhaps just a namespace is required.
NAMESPACE_BEGIN(Grid);
class GridThread {
public:
static int _threads;
static int _hyperthreads;
static int _cores;
static void SetCores(int cr) {
#ifdef GRID_OMP
_cores = cr;
#else
_cores = 1;
#endif
}
static void SetThreads(int thr) {
#ifdef GRID_OMP
_threads = MIN(thr,omp_get_max_threads()) ;
omp_set_num_threads(_threads);
#else
_threads = 1;
#endif
};
static void SetMaxThreads(void) {
#ifdef GRID_OMP
_threads = omp_get_max_threads();
omp_set_num_threads(_threads);
#else
_threads = 1;
#endif
};
static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
static int GetCores(void) { return _cores; };
static int GetThreads(void) { return _threads; };
static int SumArraySize(void) {return _threads;};
static void GetWork(int nwork, int me, int & mywork, int & myoff){
GetWork(nwork,me,mywork,myoff,_threads);
}
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
int basework = nwork/units;
int backfill = units-(nwork%units);
if ( me >= units ) {
mywork = myoff = 0;
} else {
mywork = (nwork+me)/units;
myoff = basework * me;
if ( me > backfill )
myoff+= (me-backfill);
}
return;
};
static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
me = ThreadBarrier();
GetWork(nwork,me,mywork,myoff);
};
static int ThreadBarrier(void) {
#ifdef GRID_OMP
#pragma omp barrier
return omp_get_thread_num();
#else
return 0;
#endif
};
template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
sum_array[me] = val;
val=Zero();
ThreadBarrier();
for(int i=0;i<_threads;i++) val+= sum_array[i];
ThreadBarrier();
}
static void bcopy(const void *src, void *dst, size_t len) {
#ifdef GRID_OMP
#pragma omp parallel
{
const char *c_src =(char *) src;
char *c_dest=(char *) dst;
int me,mywork,myoff;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
}
#else
bcopy(src,dst,len);
#endif
}
};
NAMESPACE_END(Grid);

View File

@ -28,101 +28,47 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#ifndef MAX
// Introduce a class to gain deterministic bit reproducible reduction. #define MAX(x,y) ((x)>(y)?(x):(y))
// make static; perhaps just a namespace is required. #define MIN(x,y) ((x)>(y)?(y):(x))
NAMESPACE_BEGIN(Grid);
class GridThread {
public:
static int _threads;
static int _hyperthreads;
static int _cores;
static void SetCores(int cr) {
#ifdef GRID_OMP
_cores = cr;
#else
_cores = 1;
#endif #endif
}
static void SetThreads(int thr) { #define strong_inline __attribute__((always_inline)) inline
#ifdef GRID_OMP #define UNROLL _Pragma("unroll")
_threads = MIN(thr,omp_get_max_threads()) ;
omp_set_num_threads(_threads); //////////////////////////////////////////////////////////////////////////////////
#else // New primitives; explicit host thread calls, and accelerator data parallel calls
_threads = 1; //////////////////////////////////////////////////////////////////////////////////
#ifdef _OPENMP
#define GRID_OMP
#include <omp.h>
#endif #endif
};
static void SetMaxThreads(void) {
#ifdef GRID_OMP #ifdef GRID_OMP
_threads = omp_get_max_threads(); #define DO_PRAGMA_(x) _Pragma (#x)
omp_set_num_threads(_threads); #define DO_PRAGMA(x) DO_PRAGMA_(x)
#define thread_num(a) omp_get_thread_num()
#define thread_max(a) omp_get_max_threads()
#else #else
_threads = 1; #define DO_PRAGMA_(x)
#define DO_PRAGMA(x)
#define thread_num(a) (0)
#define thread_max(a) (1)
#endif #endif
};
static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
static int GetCores(void) { return _cores; };
static int GetThreads(void) { return _threads; };
static int SumArraySize(void) {return _threads;};
static void GetWork(int nwork, int me, int & mywork, int & myoff){ #define thread_for( i, num, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
GetWork(nwork,me,mywork,myoff,_threads); #define thread_for2d( i1, n1,i2,n2, ... ) \
} DO_PRAGMA(omp parallel for collapse(2)) \
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){ for ( uint64_t i1=0;i1<n1;i1++) { \
int basework = nwork/units; for ( uint64_t i2=0;i2<n2;i2++) { \
int backfill = units-(nwork%units); { __VA_ARGS__ } ; \
if ( me >= units ) { }}
mywork = myoff = 0; #define thread_foreach( i, container, ... ) DO_PRAGMA(omp parallel for schedule(static)) for ( uint64_t i=container.begin();i<container.end();i++) { __VA_ARGS__ } ;
} else { #define thread_for_in_region( i, num, ... ) DO_PRAGMA(omp for schedule(static)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
mywork = (nwork+me)/units; #define thread_for_collapse2( i, num, ... ) DO_PRAGMA(omp parallel for collapse(2)) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
myoff = basework * me; #define thread_for_collapse( N , i, num, ... ) DO_PRAGMA(omp parallel for collapse ( N ) ) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
if ( me > backfill ) #define thread_for_collapse_in_region( N , i, num, ... ) DO_PRAGMA(omp for collapse ( N )) for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
myoff+= (me-backfill); #define thread_region DO_PRAGMA(omp parallel)
} #define thread_critical DO_PRAGMA(omp critical)
return;
};
static void GetWorkBarrier(int nwork, int &me, int & mywork, int & myoff){
me = ThreadBarrier();
GetWork(nwork,me,mywork,myoff);
};
static int ThreadBarrier(void) {
#ifdef GRID_OMP
#pragma omp barrier
return omp_get_thread_num();
#else
return 0;
#endif
};
template<class obj> static void ThreadSum( std::vector<obj> &sum_array,obj &val,int me){
sum_array[me] = val;
val=Zero();
ThreadBarrier();
for(int i=0;i<_threads;i++) val+= sum_array[i];
ThreadBarrier();
}
static void bcopy(const void *src, void *dst, size_t len) {
#ifdef GRID_OMP
#pragma omp parallel
{
const char *c_src =(char *) src;
char *c_dest=(char *) dst;
int me,mywork,myoff;
GridThread::GetWorkBarrier(len,me, mywork,myoff);
bcopy(&c_src[myoff],&c_dest[myoff],mywork);
}
#else
bcopy(src,dst,len);
#endif
}
};
NAMESPACE_END(Grid);

View File

@ -52,14 +52,14 @@ public:
accelerator_inline size_type size(void) const { return _size; }; accelerator_inline size_type size(void) const { return _size; };
accelerator_inline void clear(void) { resize(0);} accelerator_inline void clear(void) { resize(0);}
accelerator_inline void resize(size_type sz) { accelerator_inline void resize(size_type sz) {
#ifndef GRID_HIP
assert(sz>=0); assert(sz>=0);
assert(sz<=MaxEntries); assert(sz<=MaxEntries);
#endif
_size = sz; _size = sz;
} }
accelerator_inline void resize(size_type sz,const value &val) { accelerator_inline void resize(size_type sz,const value &val) {
assert(sz>=0); resize(sz);
assert(sz<=MaxEntries);
_size = sz;
for(int s=0;s<sz;s++) _data[s]=val; for(int s=0;s<sz;s++) _data[s]=val;
} }
accelerator_inline pointer begin(void) { return &_data[0]; } accelerator_inline pointer begin(void) { return &_data[0]; }
@ -67,7 +67,7 @@ public:
accelerator_inline pointer end (void) { return &_data[_size]; } accelerator_inline pointer end (void) { return &_data[_size]; }
accelerator_inline const_pointer end (void) const { return &_data[_size]; } accelerator_inline const_pointer end (void) const { return &_data[_size]; }
accelerator_inline void push_back(const value &val) { resize(_size+1); _data[_size-1] = val;} accelerator_inline void push_back(const value &val) { resize(_size+1); _data[_size-1] = val;}
accelerator_inline AcceleratorVector() { _size = 0; } accelerator_inline AcceleratorVector() { resize(0); }
accelerator_inline AcceleratorVector(size_type sz) { resize(sz); } accelerator_inline AcceleratorVector(size_type sz) { resize(sz); }
accelerator_inline AcceleratorVector(size_type sz,const value &val) { resize(sz,val); } accelerator_inline AcceleratorVector(size_type sz,const value &val) { resize(sz,val); }
AcceleratorVector(const std::vector<value> &copyme) { AcceleratorVector(const std::vector<value> &copyme) {

View File

@ -73,8 +73,6 @@ feenableexcept (unsigned int excepts)
} }
#endif #endif
uint32_t gpu_threads=8;
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
@ -192,16 +190,12 @@ void GridParseLayout(char **argv,int argc,
assert(ompthreads.size()==1); assert(ompthreads.size()==1);
GridThread::SetThreads(ompthreads[0]); GridThread::SetThreads(ompthreads[0]);
} }
if( GridCmdOptionExists(argv,argv+argc,"--gpu-threads") ){ if( GridCmdOptionExists(argv,argv+argc,"--accelerator-threads") ){
std::vector<int> gputhreads(0); std::vector<int> gputhreads(0);
#ifndef GRID_NVCC arg= GridCmdOptionPayload(argv,argv+argc,"--accelerator-threads");
std::cout << GridLogWarning << "'--gpu-threads' option used but Grid was"
<< " not compiled with GPU support" << std::endl;
#endif
arg= GridCmdOptionPayload(argv,argv+argc,"--gpu-threads");
GridCmdOptionIntVector(arg,gputhreads); GridCmdOptionIntVector(arg,gputhreads);
assert(gputhreads.size()==1); assert(gputhreads.size()==1);
gpu_threads=gputhreads[0]; acceleratorThreads(gputhreads[0]);
} }
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
@ -241,8 +235,6 @@ static int Grid_is_initialised;
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
void GridBanner(void) void GridBanner(void)
{ {
static int printed =0;
if( !printed ) {
std::cout <<std::endl; std::cout <<std::endl;
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; std::cout << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl;
@ -278,67 +270,6 @@ void GridBanner(void)
std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl; std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
#endif #endif
std::cout << std::endl; std::cout << std::endl;
printed=1;
}
}
#ifdef GRID_NVCC
cudaDeviceProp *gpu_props;
#endif
void GridGpuInit(void)
{
#ifdef GRID_NVCC
int nDevices = 1;
cudaGetDeviceCount(&nDevices);
gpu_props = new cudaDeviceProp[nDevices];
char * localRankStr = NULL;
int rank = 0, world_rank=0;
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// We extract the local rank initialization using an environment variable
if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
{
rank = atoi(localRankStr);
}
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
if ( world_rank == 0 ) {
GridBanner();
}
for (int i = 0; i < nDevices; i++) {
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("GpuInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
cudaGetDeviceProperties(&gpu_props[i], i);
if ( world_rank == 0) {
cudaDeviceProp prop;
prop = gpu_props[i];
printf("GpuInit: ========================\n");
printf("GpuInit: Device Number : %d\n", i);
printf("GpuInit: ========================\n");
printf("GpuInit: Device identifier: %s\n", prop.name);
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
// GPU_PROP(unifiedAddressing);
// GPU_PROP(l2CacheSize);
// GPU_PROP(singleToDoublePrecisionPerfRatio);
}
}
if ( world_rank == 0 ) {
printf("GpuInit: ================================================\n");
}
#endif
} }
void Grid_init(int *argc,char ***argv) void Grid_init(int *argc,char ***argv)
@ -353,9 +284,7 @@ void Grid_init(int *argc,char ***argv)
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Early intialisation necessities without rank knowledge // Early intialisation necessities without rank knowledge
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
GridGpuInit(); // Must come first to set device prior to MPI init acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver
PointerCache::Init();
if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
int MB; int MB;
@ -365,6 +294,14 @@ void Grid_init(int *argc,char ***argv)
GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL; GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){
int MB;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem");
GridCmdOptionInt(arg,MB);
uint64_t MB64 = MB;
MemoryManager::DeviceMaxBytes = MB64*1024LL*1024LL;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--hypercube") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--hypercube") ){
int enable; int enable;
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--hypercube"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--hypercube");
@ -381,6 +318,11 @@ void Grid_init(int *argc,char ***argv)
Grid_debug_handler_init(); Grid_debug_handler_init();
} }
//////////////////////////////////////////////////////////
// Memory manager
//////////////////////////////////////////////////////////
MemoryManager::Init();
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// MPI initialisation // MPI initialisation
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
@ -419,11 +361,18 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl; std::cout << GridLogMessage << "================================================ "<<std::endl;
/////////////////////////////////////////////////////////
// Reporting
/////////////////////////////////////////////////////////
std::cout << GridLogMessage << "Requested "<< GlobalSharedMemory::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl; std::cout << GridLogMessage << "Requested "<< GlobalSharedMemory::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
if ( GlobalSharedMemory::Hugepages) { if ( GlobalSharedMemory::Hugepages) {
std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl; std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
} }
#ifndef GRID_UVM
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
#endif
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-mem") ){
MemoryProfiler::debug = true; MemoryProfiler::debug = true;

View File

@ -237,9 +237,9 @@ public:
Vec rn ; random(sRNG,rn); Vec rn ; random(sRNG,rn);
LatticeVec z(&Grid); z=rn; LatticeVec z(&Grid); z=Zero();
LatticeVec x(&Grid); x=rn; LatticeVec x(&Grid); x=Zero();
LatticeVec y(&Grid); y=rn; LatticeVec y(&Grid); y=Zero();
double a=2.0; double a=2.0;
uint64_t Nloop=NLOOP; uint64_t Nloop=NLOOP;
@ -247,9 +247,9 @@ public:
double start=usecond(); double start=usecond();
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
z=a*x-y; z=a*x-y;
auto x_v = x.View(); autoView( x_v , x, CpuWrite);
auto y_v = y.View(); autoView( y_v , y, CpuWrite);
auto z_v = z.View(); autoView( z_v , z, CpuRead);
x_v[0]=z_v[0]; // force serial dependency to prevent optimise away x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
y_v[4]=z_v[4]; y_v[4]=z_v[4];
} }

View File

@ -21,7 +21,7 @@
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/Grid.h> #include <Grid/Grid.h>
#ifdef GRID_NVCC #ifdef GRID_CUDA
#define CUDA_PROFILE #define CUDA_PROFILE
#endif #endif
@ -129,8 +129,8 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu5d(FGrid); LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid); std::vector<LatticeColourMatrix> U(4,FGrid);
{ {
auto Umu5d_v = Umu5d.View(); autoView( Umu5d_v, Umu5d, CpuWrite);
auto Umu_v = Umu.View(); autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){ for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss]; Umu5d_v[Ls*ss+s] = Umu_v[ss];
@ -258,8 +258,8 @@ int main (int argc, char ** argv)
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x // ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1); tmp = U[mu]*Cshift(src,mu+1,1);
{ {
auto ref_v = ref.View(); autoView( ref_v, ref, CpuWrite);
auto tmp_v = tmp.View(); autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){ for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ; ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
} }
@ -268,8 +268,8 @@ int main (int argc, char ** argv)
tmp =adj(U[mu])*src; tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1); tmp =Cshift(tmp,mu+1,-1);
{ {
auto ref_v = ref.View(); autoView( ref_v, ref, CpuWrite);
auto tmp_v = tmp.View(); autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){ for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ; ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
} }

View File

@ -130,11 +130,13 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
LatticeGaugeField Umu5d(FGrid); LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension // replicate across fifth dimension
auto Umu5d_v = Umu5d.View(); {
auto Umu_v = Umu.View(); autoView(Umu5d_v , Umu5d, CpuWrite);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){ autoView( Umu_v , Umu, CpuRead);
for(int s=0;s<Ls;s++){ for(int ss=0;ss<Umu.Grid()->oSites();ss++){
Umu5d_v[Ls*ss+s] = Umu_v[ss]; for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
} }
} }

Some files were not shown because too many files have changed in this diff Show More