mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-14 01:35:36 +00:00
merge upstream develop
This commit is contained in:
commit
8726e94ea7
@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/perfmon/PerfCount.h>
|
#include <Grid/perfmon/PerfCount.h>
|
||||||
#include <Grid/util/Util.h>
|
#include <Grid/util/Util.h>
|
||||||
#include <Grid/log/Log.h>
|
#include <Grid/log/Log.h>
|
||||||
#include <Grid/allocator/AlignedAllocator.h>
|
#include <Grid/allocator/Allocator.h>
|
||||||
#include <Grid/simd/Simd.h>
|
#include <Grid/simd/Simd.h>
|
||||||
#include <Grid/threads/Threads.h>
|
#include <Grid/threads/ThreadReduction.h>
|
||||||
#include <Grid/serialisation/Serialisation.h>
|
#include <Grid/serialisation/Serialisation.h>
|
||||||
#include <Grid/util/Sha.h>
|
#include <Grid/util/Sha.h>
|
||||||
#include <Grid/communicator/Communicator.h>
|
#include <Grid/communicator/Communicator.h>
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
///////////////////
|
///////////////////
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <complex>
|
#include <complex>
|
||||||
|
#include <memory>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
@ -18,12 +18,23 @@
|
|||||||
#pragma push_macro("__CUDA_ARCH__")
|
#pragma push_macro("__CUDA_ARCH__")
|
||||||
#pragma push_macro("__NVCC__")
|
#pragma push_macro("__NVCC__")
|
||||||
#pragma push_macro("__CUDACC__")
|
#pragma push_macro("__CUDACC__")
|
||||||
|
#undef __CUDA_ARCH__
|
||||||
#undef __NVCC__
|
#undef __NVCC__
|
||||||
#undef __CUDACC__
|
#undef __CUDACC__
|
||||||
#undef __CUDA_ARCH__
|
|
||||||
#define __NVCC__REDEFINE__
|
#define __NVCC__REDEFINE__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* SYCL save and restore compile environment*/
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
#pragma push
|
||||||
|
#pragma push_macro("__SYCL_DEVICE_ONLY__")
|
||||||
|
#undef __SYCL_DEVICE_ONLY__
|
||||||
|
#define EIGEN_DONT_VECTORIZE
|
||||||
|
//#undef EIGEN_USE_SYCL
|
||||||
|
#define __SYCL__REDEFINE__
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#include <Grid/Eigen/Dense>
|
#include <Grid/Eigen/Dense>
|
||||||
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
||||||
|
|
||||||
@ -31,7 +42,13 @@
|
|||||||
#ifdef __NVCC__REDEFINE__
|
#ifdef __NVCC__REDEFINE__
|
||||||
#pragma pop_macro("__CUDACC__")
|
#pragma pop_macro("__CUDACC__")
|
||||||
#pragma pop_macro("__NVCC__")
|
#pragma pop_macro("__NVCC__")
|
||||||
#pragma pop_macro("__CUDA_ARCH__")
|
#pragma pop_macro("GRID_SIMT")
|
||||||
|
#pragma pop
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*SYCL restore*/
|
||||||
|
#ifdef __SYCL__REDEFINE__
|
||||||
|
#pragma pop_macro("__SYCL_DEVICE_ONLY__")
|
||||||
#pragma pop
|
#pragma pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -39,3 +56,4 @@
|
|||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -21,7 +21,7 @@ if BUILD_HDF5
|
|||||||
extra_headers+=serialisation/Hdf5Type.h
|
extra_headers+=serialisation/Hdf5Type.h
|
||||||
endif
|
endif
|
||||||
|
|
||||||
all: version-cache
|
all: version-cache Version.h
|
||||||
|
|
||||||
version-cache:
|
version-cache:
|
||||||
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
|
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
|
||||||
@ -42,7 +42,7 @@ version-cache:
|
|||||||
fi;\
|
fi;\
|
||||||
rm -f vertmp
|
rm -f vertmp
|
||||||
|
|
||||||
Version.h:
|
Version.h: version-cache
|
||||||
cp version-cache Version.h
|
cp version-cache Version.h
|
||||||
|
|
||||||
.PHONY: version-cache
|
.PHONY: version-cache
|
||||||
|
@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_ALGORITHMS_H
|
#ifndef GRID_ALGORITHMS_H
|
||||||
#define GRID_ALGORITHMS_H
|
#define GRID_ALGORITHMS_H
|
||||||
|
|
||||||
|
NAMESPACE_CHECK(algorithms);
|
||||||
#include <Grid/algorithms/SparseMatrix.h>
|
#include <Grid/algorithms/SparseMatrix.h>
|
||||||
#include <Grid/algorithms/LinearOperator.h>
|
#include <Grid/algorithms/LinearOperator.h>
|
||||||
#include <Grid/algorithms/Preconditioner.h>
|
#include <Grid/algorithms/Preconditioner.h>
|
||||||
|
NAMESPACE_CHECK(SparseMatrix);
|
||||||
|
|
||||||
#include <Grid/algorithms/approx/Zolotarev.h>
|
#include <Grid/algorithms/approx/Zolotarev.h>
|
||||||
#include <Grid/algorithms/approx/Chebyshev.h>
|
#include <Grid/algorithms/approx/Chebyshev.h>
|
||||||
@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/algorithms/approx/Forecast.h>
|
#include <Grid/algorithms/approx/Forecast.h>
|
||||||
#include <Grid/algorithms/approx/RemezGeneral.h>
|
#include <Grid/algorithms/approx/RemezGeneral.h>
|
||||||
#include <Grid/algorithms/approx/ZMobius.h>
|
#include <Grid/algorithms/approx/ZMobius.h>
|
||||||
|
NAMESPACE_CHECK(approx);
|
||||||
#include <Grid/algorithms/iterative/Deflation.h>
|
#include <Grid/algorithms/iterative/Deflation.h>
|
||||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||||
|
NAMESPACE_CHECK(ConjGrad);
|
||||||
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
||||||
|
NAMESPACE_CHECK(BiCGSTAB);
|
||||||
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
||||||
#include <Grid/algorithms/iterative/NormalEquations.h>
|
#include <Grid/algorithms/iterative/NormalEquations.h>
|
||||||
#include <Grid/algorithms/iterative/SchurRedBlack.h>
|
#include <Grid/algorithms/iterative/SchurRedBlack.h>
|
||||||
@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||||
#include <Grid/algorithms/iterative/PowerMethod.h>
|
#include <Grid/algorithms/iterative/PowerMethod.h>
|
||||||
|
|
||||||
|
NAMESPACE_CHECK(PowerMethod);
|
||||||
#include <Grid/algorithms/CoarsenedMatrix.h>
|
#include <Grid/algorithms/CoarsenedMatrix.h>
|
||||||
|
NAMESPACE_CHECK(CoarsendMatrix);
|
||||||
#include <Grid/algorithms/FFT.h>
|
#include <Grid/algorithms/FFT.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,14 +1,3 @@
|
|||||||
// blockZaxpy in bockPromote - 3s, 5%
|
|
||||||
// noncoalesced linalg in Preconditionoer ~ 3s 5%
|
|
||||||
// Lancos tuning or replace 10-20s ~ 25%, open ended
|
|
||||||
// setup tuning 5s ~ 8%
|
|
||||||
// -- e.g. ordermin, orderstep tunables.
|
|
||||||
// MdagM path without norm in LinOp code. few seconds
|
|
||||||
|
|
||||||
// Mdir calc blocking kernels
|
|
||||||
// Fuse kernels in blockMaskedInnerProduct
|
|
||||||
// preallocate Vectors in Cayley 5D ~ few percent few seconds
|
|
||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -91,35 +80,8 @@ public:
|
|||||||
}
|
}
|
||||||
directions [2*_d]=0;
|
directions [2*_d]=0;
|
||||||
displacements[2*_d]=0;
|
displacements[2*_d]=0;
|
||||||
|
|
||||||
//// report back
|
|
||||||
std::cout<<GridLogMessage<<"directions :";
|
|
||||||
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
|
|
||||||
std::cout<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"displacements :";
|
|
||||||
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
|
|
||||||
std::cout<<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
// Original cleaner code
|
|
||||||
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
|
|
||||||
for(int d=0;d<dimension;d++){
|
|
||||||
directions[2*d ] = d;
|
|
||||||
directions[2*d+1] = d;
|
|
||||||
displacements[2*d ] = +1;
|
|
||||||
displacements[2*d+1] = -1;
|
|
||||||
}
|
|
||||||
directions [2*dimension]=0;
|
|
||||||
displacements[2*dimension]=0;
|
|
||||||
}
|
|
||||||
std::vector<int> GetDelta(int point) {
|
|
||||||
std::vector<int> delta(dimension,0);
|
|
||||||
delta[directions[point]] = displacements[point];
|
|
||||||
return delta;
|
|
||||||
};
|
|
||||||
*/
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Fobj,class CComplex,int nbasis>
|
template<class Fobj,class CComplex,int nbasis>
|
||||||
@ -149,24 +111,6 @@ public:
|
|||||||
CoarseScalar InnerProd(CoarseGrid);
|
CoarseScalar InnerProd(CoarseGrid);
|
||||||
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
|
std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
|
||||||
blockOrthogonalise(InnerProd,subspace);
|
blockOrthogonalise(InnerProd,subspace);
|
||||||
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 2"<<std::endl; // Really have to do twice? Yuck
|
|
||||||
// blockOrthogonalise(InnerProd,subspace);
|
|
||||||
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
|
|
||||||
// CheckOrthogonal();
|
|
||||||
}
|
|
||||||
void CheckOrthogonal(void){
|
|
||||||
CoarseVector iProj(CoarseGrid);
|
|
||||||
CoarseVector eProj(CoarseGrid);
|
|
||||||
for(int i=0;i<nbasis;i++){
|
|
||||||
blockProject(iProj,subspace[i],subspace);
|
|
||||||
eProj=Zero();
|
|
||||||
accelerator_for(ss, CoarseGrid->oSites(),1,{
|
|
||||||
eProj[ss](i)=CComplex(1.0);
|
|
||||||
});
|
|
||||||
eProj=eProj - iProj;
|
|
||||||
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
|
|
||||||
}
|
|
||||||
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
|
|
||||||
}
|
}
|
||||||
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
|
||||||
blockProject(CoarseVec,FineVec,subspace);
|
blockProject(CoarseVec,FineVec,subspace);
|
||||||
@ -175,11 +119,6 @@ public:
|
|||||||
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
FineVec.Checkerboard() = subspace[0].Checkerboard();
|
||||||
blockPromote(CoarseVec,FineVec,subspace);
|
blockPromote(CoarseVec,FineVec,subspace);
|
||||||
}
|
}
|
||||||
void CreateSubspaceRandom(GridParallelRNG &RNG){
|
|
||||||
for(int i=0;i<nbasis;i++){
|
|
||||||
random(RNG,subspace[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
||||||
|
|
||||||
@ -218,7 +157,7 @@ public:
|
|||||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||||
// and this is the best I found
|
// and this is the best I found
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#if 1
|
|
||||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||||
int nn,
|
int nn,
|
||||||
double hi,
|
double hi,
|
||||||
@ -280,10 +219,10 @@ public:
|
|||||||
|
|
||||||
hermop.HermOp(*Tn,y);
|
hermop.HermOp(*Tn,y);
|
||||||
|
|
||||||
auto y_v = y.View();
|
autoView( y_v , y, AcceleratorWrite);
|
||||||
auto Tn_v = Tn->View();
|
autoView( Tn_v , (*Tn), AcceleratorWrite);
|
||||||
auto Tnp_v = Tnp->View();
|
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
|
||||||
auto Tnm_v = Tnm->View();
|
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
||||||
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
||||||
@ -313,201 +252,6 @@ public:
|
|||||||
}
|
}
|
||||||
assert(b==nn);
|
assert(b==nn);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
#if 0
|
|
||||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
|
||||||
int nn,
|
|
||||||
double hi,
|
|
||||||
double lo,
|
|
||||||
int orderfilter,
|
|
||||||
int ordermin,
|
|
||||||
int orderstep,
|
|
||||||
double filterlo
|
|
||||||
) {
|
|
||||||
|
|
||||||
RealD scale;
|
|
||||||
|
|
||||||
FineField noise(FineGrid);
|
|
||||||
FineField Mn(FineGrid);
|
|
||||||
FineField tmp(FineGrid);
|
|
||||||
FineField combined(FineGrid);
|
|
||||||
|
|
||||||
// New normalised noise
|
|
||||||
gaussian(RNG,noise);
|
|
||||||
scale = std::pow(norm2(noise),-0.5);
|
|
||||||
noise=noise*scale;
|
|
||||||
|
|
||||||
// Initial matrix element
|
|
||||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
|
||||||
|
|
||||||
int b =0;
|
|
||||||
#define FILTERb(llo,hhi,oorder) \
|
|
||||||
{ \
|
|
||||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
|
||||||
Cheb(hermop,noise,Mn); \
|
|
||||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
|
||||||
subspace[b] = Mn; \
|
|
||||||
hermop.Op(Mn,tmp); \
|
|
||||||
std::cout<<GridLogMessage << oorder<< " Cheb filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
|
||||||
b++; \
|
|
||||||
}
|
|
||||||
|
|
||||||
// JacobiPolynomial<FineField> Cheb(0.002,60.0,1500,-0.5,3.5); \
|
|
||||||
|
|
||||||
RealD alpha=-0.8;
|
|
||||||
RealD beta =-0.8;
|
|
||||||
#define FILTER(llo,hhi,oorder) \
|
|
||||||
{ \
|
|
||||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
|
||||||
/* JacobiPolynomial<FineField> Cheb(0.0,60.0,oorder,alpha,beta);*/\
|
|
||||||
Cheb(hermop,noise,Mn); \
|
|
||||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; \
|
|
||||||
subspace[b] = Mn; \
|
|
||||||
hermop.Op(Mn,tmp); \
|
|
||||||
std::cout<<GridLogMessage << oorder<< "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
|
||||||
b++; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define FILTERc(llo,hhi,oorder) \
|
|
||||||
{ \
|
|
||||||
Chebyshev<FineField> Cheb(llo,hhi,oorder); \
|
|
||||||
Cheb(hermop,noise,combined); \
|
|
||||||
}
|
|
||||||
|
|
||||||
double node = 0.000;
|
|
||||||
FILTERb(lo,hi,orderfilter);// 0
|
|
||||||
// FILTERc(node,hi,51);// 0
|
|
||||||
noise = Mn;
|
|
||||||
int base = 0;
|
|
||||||
int mult = 100;
|
|
||||||
FILTER(node,hi,base+1*mult);
|
|
||||||
FILTER(node,hi,base+2*mult);
|
|
||||||
FILTER(node,hi,base+3*mult);
|
|
||||||
FILTER(node,hi,base+4*mult);
|
|
||||||
FILTER(node,hi,base+5*mult);
|
|
||||||
FILTER(node,hi,base+6*mult);
|
|
||||||
FILTER(node,hi,base+7*mult);
|
|
||||||
FILTER(node,hi,base+8*mult);
|
|
||||||
FILTER(node,hi,base+9*mult);
|
|
||||||
FILTER(node,hi,base+10*mult);
|
|
||||||
FILTER(node,hi,base+11*mult);
|
|
||||||
FILTER(node,hi,base+12*mult);
|
|
||||||
FILTER(node,hi,base+13*mult);
|
|
||||||
FILTER(node,hi,base+14*mult);
|
|
||||||
FILTER(node,hi,base+15*mult);
|
|
||||||
assert(b==nn);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
|
||||||
int nn,
|
|
||||||
double hi,
|
|
||||||
double lo,
|
|
||||||
int orderfilter,
|
|
||||||
int ordermin,
|
|
||||||
int orderstep,
|
|
||||||
double filterlo
|
|
||||||
) {
|
|
||||||
|
|
||||||
RealD scale;
|
|
||||||
|
|
||||||
FineField noise(FineGrid);
|
|
||||||
FineField Mn(FineGrid);
|
|
||||||
FineField tmp(FineGrid);
|
|
||||||
FineField combined(FineGrid);
|
|
||||||
|
|
||||||
// New normalised noise
|
|
||||||
gaussian(RNG,noise);
|
|
||||||
scale = std::pow(norm2(noise),-0.5);
|
|
||||||
noise=noise*scale;
|
|
||||||
|
|
||||||
// Initial matrix element
|
|
||||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
|
||||||
|
|
||||||
int b =0;
|
|
||||||
{
|
|
||||||
Chebyshev<FineField> JacobiPoly(0.005,60.,1500);
|
|
||||||
// JacobiPolynomial<FineField> JacobiPoly(0.002,60.0,1500,-0.5,3.5);
|
|
||||||
//JacobiPolynomial<FineField> JacobiPoly(0.03,60.0,500,-0.5,3.5);
|
|
||||||
// JacobiPolynomial<FineField> JacobiPoly(0.00,60.0,1000,-0.5,3.5);
|
|
||||||
JacobiPoly(hermop,noise,Mn);
|
|
||||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
|
||||||
subspace[b] = Mn;
|
|
||||||
hermop.Op(Mn,tmp);
|
|
||||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
|
||||||
b++;
|
|
||||||
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
|
||||||
// subspace[b] = tmp; b++;
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
|
|
||||||
#define FILTER(lambda) \
|
|
||||||
{ \
|
|
||||||
hermop.HermOp(subspace[0],tmp); \
|
|
||||||
tmp = tmp - lambda *subspace[0]; \
|
|
||||||
scale = std::pow(norm2(tmp),-0.5); \
|
|
||||||
tmp=tmp*scale; \
|
|
||||||
subspace[b] = tmp; \
|
|
||||||
hermop.Op(subspace[b],tmp); \
|
|
||||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; \
|
|
||||||
b++; \
|
|
||||||
}
|
|
||||||
// scale = std::pow(norm2(tmp),-0.5); tmp=tmp*scale;
|
|
||||||
// subspace[b] = tmp; b++;
|
|
||||||
// }
|
|
||||||
|
|
||||||
FILTER(2.0e-5);
|
|
||||||
FILTER(2.0e-4);
|
|
||||||
FILTER(4.0e-4);
|
|
||||||
FILTER(8.0e-4);
|
|
||||||
FILTER(8.0e-4);
|
|
||||||
|
|
||||||
FILTER(2.0e-3);
|
|
||||||
FILTER(3.0e-3);
|
|
||||||
FILTER(4.0e-3);
|
|
||||||
FILTER(5.0e-3);
|
|
||||||
FILTER(6.0e-3);
|
|
||||||
|
|
||||||
FILTER(2.5e-3);
|
|
||||||
FILTER(3.5e-3);
|
|
||||||
FILTER(4.5e-3);
|
|
||||||
FILTER(5.5e-3);
|
|
||||||
FILTER(6.5e-3);
|
|
||||||
|
|
||||||
// FILTER(6.0e-5);//6
|
|
||||||
// FILTER(7.0e-5);//8
|
|
||||||
// FILTER(8.0e-5);//9
|
|
||||||
// FILTER(9.0e-5);//3
|
|
||||||
|
|
||||||
/*
|
|
||||||
// FILTER(1.0e-4);//10
|
|
||||||
FILTER(2.0e-4);//11
|
|
||||||
// FILTER(3.0e-4);//12
|
|
||||||
// FILTER(4.0e-4);//13
|
|
||||||
FILTER(5.0e-4);//14
|
|
||||||
|
|
||||||
FILTER(6.0e-3);//4
|
|
||||||
FILTER(7.0e-4);//1
|
|
||||||
FILTER(8.0e-4);//7
|
|
||||||
FILTER(9.0e-4);//15
|
|
||||||
FILTER(1.0e-3);//2
|
|
||||||
|
|
||||||
FILTER(2.0e-3);//2
|
|
||||||
FILTER(3.0e-3);//2
|
|
||||||
FILTER(4.0e-3);//2
|
|
||||||
FILTER(5.0e-3);//2
|
|
||||||
FILTER(6.0e-3);//2
|
|
||||||
|
|
||||||
FILTER(7.0e-3);//2
|
|
||||||
FILTER(8.0e-3);//2
|
|
||||||
FILTER(1.0e-2);//2
|
|
||||||
*/
|
|
||||||
std::cout << GridLogMessage <<"Jacobi filtering done" <<std::endl;
|
|
||||||
assert(b==nn);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -541,36 +285,28 @@ public:
|
|||||||
///////////////////////
|
///////////////////////
|
||||||
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
|
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
|
||||||
|
|
||||||
RealD M (const CoarseVector &in, CoarseVector &out){
|
void M (const CoarseVector &in, CoarseVector &out)
|
||||||
|
{
|
||||||
conformable(_grid,in.Grid());
|
conformable(_grid,in.Grid());
|
||||||
conformable(in.Grid(),out.Grid());
|
conformable(in.Grid(),out.Grid());
|
||||||
|
|
||||||
// RealD Nin = norm2(in);
|
|
||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
double comms_usec = -usecond();
|
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
comms_usec += usecond();
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
auto in_v = in.View();
|
|
||||||
auto out_v = out.View();
|
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
GridStopWatch ArithmeticTimer;
|
|
||||||
int osites=Grid()->oSites();
|
int osites=Grid()->oSites();
|
||||||
// double flops = osites*Nsimd*nbasis*nbasis*8.0*geom.npoint;
|
|
||||||
// double bytes = osites*nbasis*nbasis*geom.npoint*sizeof(CComplex);
|
|
||||||
double usecs =-usecond();
|
|
||||||
// assert(geom.npoint==9);
|
|
||||||
|
|
||||||
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
||||||
int ss = sss/nbasis;
|
int ss = sss/nbasis;
|
||||||
@ -580,41 +316,28 @@ public:
|
|||||||
int ptype;
|
int ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
int lane=SIMTlane(Nsimd);
|
|
||||||
for(int point=0;point<geom.npoint;point++){
|
for(int point=0;point<geom.npoint;point++){
|
||||||
|
|
||||||
SE=Stencil.GetEntry(ptype,point,ss);
|
SE=Stencil.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
if(SE->_is_local) {
|
if(SE->_is_local) {
|
||||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
} else {
|
} else {
|
||||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
||||||
}
|
}
|
||||||
synchronise();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
for(int bb=0;bb<nbasis;bb++) {
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res,lane);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
usecs +=usecond();
|
|
||||||
|
|
||||||
double nrm_usec=-usecond();
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
RealD Nout= norm2(out);
|
|
||||||
nrm_usec+=usecond();
|
|
||||||
|
|
||||||
/*
|
|
||||||
std::cout << GridLogMessage << "\tNorm " << nrm_usec << " us" <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "\tHalo " << comms_usec << " us" <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "\tMatrix " << usecs << " us" <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "\t mflop/s " << flops/usecs<<std::endl;
|
|
||||||
std::cout << GridLogMessage << "\t MB/s " << bytes/usecs<<std::endl;
|
|
||||||
*/
|
|
||||||
return Nout;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
RealD Mdag (const CoarseVector &in, CoarseVector &out)
|
void Mdag (const CoarseVector &in, CoarseVector &out)
|
||||||
{
|
{
|
||||||
if(hermitian) {
|
if(hermitian) {
|
||||||
// corresponds to Petrov-Galerkin coarsening
|
// corresponds to Petrov-Galerkin coarsening
|
||||||
@ -625,7 +348,6 @@ public:
|
|||||||
G5C(tmp, in);
|
G5C(tmp, in);
|
||||||
M(tmp, out);
|
M(tmp, out);
|
||||||
G5C(out, out);
|
G5C(out, out);
|
||||||
return norm2(out);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
void MdirComms(const CoarseVector &in)
|
void MdirComms(const CoarseVector &in)
|
||||||
@ -640,11 +362,11 @@ public:
|
|||||||
|
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View());
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
auto out_v = out.View();
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
auto in_v = in.View();
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
@ -658,45 +380,21 @@ public:
|
|||||||
int ptype;
|
int ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
int lane=SIMTlane(Nsimd);
|
|
||||||
SE=Stencil.GetEntry(ptype,point,ss);
|
SE=Stencil.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
if(SE->_is_local) {
|
if(SE->_is_local) {
|
||||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute,lane);
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
} else {
|
} else {
|
||||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset],lane);
|
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
||||||
}
|
}
|
||||||
synchronise();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
for(int bb=0;bb<nbasis;bb++) {
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res,lane);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
#if 0
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
accelerator_for(ss,Grid()->oSites(),1,{
|
|
||||||
|
|
||||||
siteVector res = Zero();
|
|
||||||
siteVector nbr;
|
|
||||||
int ptype;
|
|
||||||
StencilEntry *SE;
|
|
||||||
|
|
||||||
SE=Stencil.GetEntry(ptype,point,ss);
|
|
||||||
|
|
||||||
if(SE->_is_local&&SE->_permute) {
|
|
||||||
permute(nbr,in_v[SE->_offset],ptype);
|
|
||||||
} else if(SE->_is_local) {
|
|
||||||
nbr = in_v[SE->_offset];
|
|
||||||
} else {
|
|
||||||
nbr = Stencil.CommBuf()[SE->_offset];
|
|
||||||
}
|
|
||||||
synchronise();
|
|
||||||
|
|
||||||
res = res + Aview_p[point][ss]*nbr;
|
|
||||||
|
|
||||||
out_v[ss]=res;
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||||
{
|
{
|
||||||
@ -864,14 +562,12 @@ public:
|
|||||||
|
|
||||||
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
|
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
|
||||||
|
|
||||||
auto iZProj_v = iZProj.View() ;
|
autoView( iZProj_v , iZProj, AcceleratorRead) ;
|
||||||
auto oZProj_v = oZProj.View() ;
|
autoView( oZProj_v , oZProj, AcceleratorRead) ;
|
||||||
auto A_p = A[p].View();
|
autoView( A_p , A[p], AcceleratorWrite);
|
||||||
auto A_self = A[self_stencil].View();
|
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
||||||
|
|
||||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
||||||
// if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
|
|
||||||
// accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_self[ss](j,i),A_self(ss)(j,i)+iZProj_v(ss)); });
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -885,11 +581,11 @@ public:
|
|||||||
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
||||||
|
|
||||||
{
|
{
|
||||||
auto tmp_ = tmp.View();
|
autoView( tmp_ , tmp, AcceleratorWrite);
|
||||||
auto evenmask_ = evenmask.View();
|
autoView( evenmask_ , evenmask, AcceleratorRead);
|
||||||
auto oddmask_ = oddmask.View();
|
autoView( oddmask_ , oddmask, AcceleratorRead);
|
||||||
auto Mphie_ = Mphie.View();
|
autoView( Mphie_ , Mphie, AcceleratorRead);
|
||||||
auto Mphio_ = Mphio.View();
|
autoView( Mphio_ , Mphio, AcceleratorRead);
|
||||||
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
||||||
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
||||||
});
|
});
|
||||||
@ -897,8 +593,8 @@ public:
|
|||||||
|
|
||||||
blockProject(SelfProj,tmp,Subspace.subspace);
|
blockProject(SelfProj,tmp,Subspace.subspace);
|
||||||
|
|
||||||
auto SelfProj_ = SelfProj.View();
|
autoView( SelfProj_ , SelfProj, AcceleratorRead);
|
||||||
auto A_self = A[self_stencil].View();
|
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
||||||
|
|
||||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
||||||
for(int j=0;j<nbasis;j++){
|
for(int j=0;j<nbasis;j++){
|
||||||
@ -912,33 +608,8 @@ public:
|
|||||||
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
||||||
ForceHermitian();
|
ForceHermitian();
|
||||||
}
|
}
|
||||||
// AssertHermitian();
|
|
||||||
// ForceDiagonal();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
|
||||||
///////////////////////////
|
|
||||||
// test code worth preserving in if block
|
|
||||||
///////////////////////////
|
|
||||||
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
|
|
||||||
for(int p=0;p<geom.npoint;p++){
|
|
||||||
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
|
|
||||||
std::cout<<GridLogMessage<< A[p] << std::endl;
|
|
||||||
}
|
|
||||||
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
|
|
||||||
|
|
||||||
phi=Subspace.subspace[0];
|
|
||||||
std::vector<int> bc(FineGrid->_ndimension,0);
|
|
||||||
|
|
||||||
blockPick(Grid(),phi,tmp,bc); // Pick out a block
|
|
||||||
linop.Op(tmp,Mphi); // Apply big dop
|
|
||||||
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
|
|
||||||
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<< iProj <<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
void ForceHermitian(void) {
|
void ForceHermitian(void) {
|
||||||
CoarseMatrix Diff (Grid());
|
CoarseMatrix Diff (Grid());
|
||||||
for(int p=0;p<geom.npoint;p++){
|
for(int p=0;p<geom.npoint;p++){
|
||||||
@ -958,27 +629,6 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void AssertHermitian(void) {
|
|
||||||
CoarseMatrix AA (Grid());
|
|
||||||
CoarseMatrix AAc (Grid());
|
|
||||||
CoarseMatrix Diff (Grid());
|
|
||||||
for(int d=0;d<4;d++){
|
|
||||||
|
|
||||||
int dd=d+1;
|
|
||||||
AAc = Cshift(A[2*d+1],dd,1);
|
|
||||||
AA = A[2*d];
|
|
||||||
|
|
||||||
Diff = AA - adj(AAc);
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
|
|
||||||
|
|
||||||
}
|
|
||||||
Diff = A[8] - adj(A[8]);
|
|
||||||
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -37,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class scalar> struct FFTW { };
|
template<class scalar> struct FFTW { };
|
||||||
@ -191,7 +189,7 @@ public:
|
|||||||
typedef typename sobj::scalar_type scalar;
|
typedef typename sobj::scalar_type scalar;
|
||||||
|
|
||||||
Lattice<sobj> pgbuf(&pencil_g);
|
Lattice<sobj> pgbuf(&pencil_g);
|
||||||
auto pgbuf_v = pgbuf.View();
|
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||||
|
|
||||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
@ -232,15 +230,18 @@ public:
|
|||||||
result = source;
|
result = source;
|
||||||
int pc = processor_coor[dim];
|
int pc = processor_coor[dim];
|
||||||
for(int p=0;p<processors[dim];p++) {
|
for(int p=0;p<processors[dim];p++) {
|
||||||
|
{
|
||||||
|
autoView(r_v,result,CpuRead);
|
||||||
|
autoView(p_v,pgbuf,CpuWrite);
|
||||||
thread_for(idx, sgrid->lSites(),{
|
thread_for(idx, sgrid->lSites(),{
|
||||||
Coordinate cbuf(Nd);
|
Coordinate cbuf(Nd);
|
||||||
sobj s;
|
sobj s;
|
||||||
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
||||||
peekLocalSite(s,result,cbuf);
|
peekLocalSite(s,r_v,cbuf);
|
||||||
cbuf[dim]+=((pc+p) % processors[dim])*L;
|
cbuf[dim]+=((pc+p) % processors[dim])*L;
|
||||||
// cbuf[dim]+=p*L;
|
pokeLocalSite(s,p_v,cbuf);
|
||||||
pokeLocalSite(s,pgbuf,cbuf);
|
|
||||||
});
|
});
|
||||||
|
}
|
||||||
if (p != processors[dim] - 1) {
|
if (p != processors[dim] - 1) {
|
||||||
result = Cshift(result,dim,L);
|
result = Cshift(result,dim,L);
|
||||||
}
|
}
|
||||||
@ -269,15 +270,19 @@ public:
|
|||||||
flops+= flops_call*NN;
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
// writing out result
|
// writing out result
|
||||||
|
{
|
||||||
|
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||||
|
autoView(result_v,result,CpuWrite);
|
||||||
thread_for(idx,sgrid->lSites(),{
|
thread_for(idx,sgrid->lSites(),{
|
||||||
Coordinate clbuf(Nd), cgbuf(Nd);
|
Coordinate clbuf(Nd), cgbuf(Nd);
|
||||||
sobj s;
|
sobj s;
|
||||||
sgrid->LocalIndexToLocalCoor(idx,clbuf);
|
sgrid->LocalIndexToLocalCoor(idx,clbuf);
|
||||||
cgbuf = clbuf;
|
cgbuf = clbuf;
|
||||||
cgbuf[dim] = clbuf[dim]+L*pc;
|
cgbuf[dim] = clbuf[dim]+L*pc;
|
||||||
peekLocalSite(s,pgbuf,cgbuf);
|
peekLocalSite(s,pgbuf_v,cgbuf);
|
||||||
pokeLocalSite(s,result,clbuf);
|
pokeLocalSite(s,result_v,clbuf);
|
||||||
});
|
});
|
||||||
|
}
|
||||||
result = result*div;
|
result = result*div;
|
||||||
|
|
||||||
// destroying plan
|
// destroying plan
|
||||||
|
@ -43,7 +43,6 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class Field> class LinearOperatorBase {
|
template<class Field> class LinearOperatorBase {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// Support for coarsening to a multigrid
|
// Support for coarsening to a multigrid
|
||||||
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
|
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
|
||||||
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
|
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
|
||||||
@ -94,7 +93,10 @@ public:
|
|||||||
_Mat.Mdag(in,out);
|
_Mat.Mdag(in,out);
|
||||||
}
|
}
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||||
_Mat.MdagM(in,out,n1,n2);
|
_Mat.MdagM(in,out);
|
||||||
|
ComplexD dot = innerProduct(in,out);
|
||||||
|
n1=real(dot);
|
||||||
|
n2=norm2(out);
|
||||||
}
|
}
|
||||||
void HermOp(const Field &in, Field &out){
|
void HermOp(const Field &in, Field &out){
|
||||||
_Mat.MdagM(in,out);
|
_Mat.MdagM(in,out);
|
||||||
@ -131,17 +133,14 @@ public:
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||||
_Mat.MdagM(in,out,n1,n2);
|
HermOp(in,out);
|
||||||
out = out + _shift*in;
|
ComplexD dot = innerProduct(in,out);
|
||||||
|
|
||||||
ComplexD dot;
|
|
||||||
dot= innerProduct(in,out);
|
|
||||||
n1=real(dot);
|
n1=real(dot);
|
||||||
n2=norm2(out);
|
n2=norm2(out);
|
||||||
}
|
}
|
||||||
void HermOp(const Field &in, Field &out){
|
void HermOp(const Field &in, Field &out){
|
||||||
RealD n1,n2;
|
_Mat.MdagM(in,out);
|
||||||
HermOpAndNorm(in,out,n1,n2);
|
out = out + _shift*in;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -170,7 +169,7 @@ public:
|
|||||||
_Mat.M(in,out);
|
_Mat.M(in,out);
|
||||||
}
|
}
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||||
_Mat.M(in,out);
|
HermOp(in,out);
|
||||||
ComplexD dot= innerProduct(in,out); n1=real(dot);
|
ComplexD dot= innerProduct(in,out); n1=real(dot);
|
||||||
n2=norm2(out);
|
n2=norm2(out);
|
||||||
}
|
}
|
||||||
@ -216,21 +215,24 @@ public:
|
|||||||
template<class Field>
|
template<class Field>
|
||||||
class SchurOperatorBase : public LinearOperatorBase<Field> {
|
class SchurOperatorBase : public LinearOperatorBase<Field> {
|
||||||
public:
|
public:
|
||||||
virtual RealD Mpc (const Field &in, Field &out) =0;
|
virtual void Mpc (const Field &in, Field &out) =0;
|
||||||
virtual RealD MpcDag (const Field &in, Field &out) =0;
|
virtual void MpcDag (const Field &in, Field &out) =0;
|
||||||
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
|
virtual void MpcDagMpc(const Field &in, Field &out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
tmp.Checkerboard() = in.Checkerboard();
|
tmp.Checkerboard() = in.Checkerboard();
|
||||||
ni=Mpc(in,tmp);
|
Mpc(in,tmp);
|
||||||
no=MpcDag(tmp,out);
|
MpcDag(tmp,out);
|
||||||
}
|
}
|
||||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
MpcDagMpc(in,out,n1,n2);
|
MpcDagMpc(in,out);
|
||||||
|
ComplexD dot= innerProduct(in,out);
|
||||||
|
n1=real(dot);
|
||||||
|
n2=norm2(out);
|
||||||
}
|
}
|
||||||
virtual void HermOp(const Field &in, Field &out){
|
virtual void HermOp(const Field &in, Field &out){
|
||||||
RealD n1,n2;
|
out.Checkerboard() = in.Checkerboard();
|
||||||
HermOpAndNorm(in,out,n1,n2);
|
MpcDagMpc(in,out);
|
||||||
}
|
}
|
||||||
void Op (const Field &in, Field &out){
|
void Op (const Field &in, Field &out){
|
||||||
Mpc(in,out);
|
Mpc(in,out);
|
||||||
@ -254,28 +256,24 @@ public:
|
|||||||
public:
|
public:
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
|
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
|
||||||
virtual RealD Mpc (const Field &in, Field &out) {
|
virtual void Mpc (const Field &in, Field &out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
tmp.Checkerboard() = !in.Checkerboard();
|
tmp.Checkerboard() = !in.Checkerboard();
|
||||||
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
|
|
||||||
|
|
||||||
_Mat.Meooe(in,tmp);
|
_Mat.Meooe(in,tmp);
|
||||||
_Mat.MooeeInv(tmp,out);
|
_Mat.MooeeInv(tmp,out);
|
||||||
_Mat.Meooe(out,tmp);
|
_Mat.Meooe(out,tmp);
|
||||||
|
|
||||||
//std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl;
|
|
||||||
_Mat.Mooee(in,out);
|
_Mat.Mooee(in,out);
|
||||||
return axpy_norm(out,-1.0,tmp,out);
|
axpy(out,-1.0,tmp,out);
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag (const Field &in, Field &out){
|
virtual void MpcDag (const Field &in, Field &out){
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.MeooeDag(in,tmp);
|
_Mat.MeooeDag(in,tmp);
|
||||||
_Mat.MooeeInvDag(tmp,out);
|
_Mat.MooeeInvDag(tmp,out);
|
||||||
_Mat.MeooeDag(out,tmp);
|
_Mat.MeooeDag(out,tmp);
|
||||||
|
|
||||||
_Mat.MooeeDag(in,out);
|
_Mat.MooeeDag(in,out);
|
||||||
return axpy_norm(out,-1.0,tmp,out);
|
axpy(out,-1.0,tmp,out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class Matrix,class Field>
|
template<class Matrix,class Field>
|
||||||
@ -285,25 +283,23 @@ public:
|
|||||||
public:
|
public:
|
||||||
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
|
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
|
||||||
|
|
||||||
virtual RealD Mpc (const Field &in, Field &out) {
|
virtual void Mpc (const Field &in, Field &out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.Meooe(in,out);
|
_Mat.Meooe(in,out);
|
||||||
_Mat.MooeeInv(out,tmp);
|
_Mat.MooeeInv(out,tmp);
|
||||||
_Mat.Meooe(tmp,out);
|
_Mat.Meooe(tmp,out);
|
||||||
_Mat.MooeeInv(out,tmp);
|
_Mat.MooeeInv(out,tmp);
|
||||||
|
axpy(out,-1.0,tmp,in);
|
||||||
return axpy_norm(out,-1.0,tmp,in);
|
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag (const Field &in, Field &out){
|
virtual void MpcDag (const Field &in, Field &out){
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.MooeeInvDag(in,out);
|
_Mat.MooeeInvDag(in,out);
|
||||||
_Mat.MeooeDag(out,tmp);
|
_Mat.MeooeDag(out,tmp);
|
||||||
_Mat.MooeeInvDag(tmp,out);
|
_Mat.MooeeInvDag(tmp,out);
|
||||||
_Mat.MeooeDag(out,tmp);
|
_Mat.MeooeDag(out,tmp);
|
||||||
|
axpy(out,-1.0,tmp,in);
|
||||||
return axpy_norm(out,-1.0,tmp,in);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class Matrix,class Field>
|
template<class Matrix,class Field>
|
||||||
@ -313,7 +309,7 @@ public:
|
|||||||
public:
|
public:
|
||||||
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
|
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
|
||||||
|
|
||||||
virtual RealD Mpc (const Field &in, Field &out) {
|
virtual void Mpc (const Field &in, Field &out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.MooeeInv(in,out);
|
_Mat.MooeeInv(in,out);
|
||||||
@ -321,9 +317,9 @@ public:
|
|||||||
_Mat.MooeeInv(tmp,out);
|
_Mat.MooeeInv(tmp,out);
|
||||||
_Mat.Meooe(out,tmp);
|
_Mat.Meooe(out,tmp);
|
||||||
|
|
||||||
return axpy_norm(out,-1.0,tmp,in);
|
axpy(out,-1.0,tmp,in);
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag (const Field &in, Field &out){
|
virtual void MpcDag (const Field &in, Field &out){
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.MeooeDag(in,out);
|
_Mat.MeooeDag(in,out);
|
||||||
@ -331,7 +327,7 @@ public:
|
|||||||
_Mat.MeooeDag(tmp,out);
|
_Mat.MeooeDag(tmp,out);
|
||||||
_Mat.MooeeInvDag(out,tmp);
|
_Mat.MooeeInvDag(out,tmp);
|
||||||
|
|
||||||
return axpy_norm(out,-1.0,tmp,in);
|
axpy(out,-1.0,tmp,in);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -339,13 +335,13 @@ public:
|
|||||||
class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
|
class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
virtual RealD Mpc (const Field& in, Field& out) = 0;
|
virtual void Mpc (const Field& in, Field& out) = 0;
|
||||||
virtual RealD MpcDag (const Field& in, Field& out) = 0;
|
virtual void MpcDag (const Field& in, Field& out) = 0;
|
||||||
virtual void MpcDagMpc(const Field& in, Field& out, RealD& ni, RealD& no) {
|
virtual void MpcDagMpc(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
tmp.Checkerboard() = in.Checkerboard();
|
tmp.Checkerboard() = in.Checkerboard();
|
||||||
ni = Mpc(in,tmp);
|
Mpc(in,tmp);
|
||||||
no = MpcDag(tmp,out);
|
MpcDag(tmp,out);
|
||||||
}
|
}
|
||||||
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
|
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -366,6 +362,9 @@ public:
|
|||||||
void OpDir(const Field& in, Field& out, int dir, int disp) {
|
void OpDir(const Field& in, Field& out, int dir, int disp) {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
void OpDirAll(const Field& in, std::vector<Field>& out){
|
||||||
|
assert(0);
|
||||||
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Matrix, class Field>
|
template<class Matrix, class Field>
|
||||||
@ -374,7 +373,7 @@ public:
|
|||||||
public:
|
public:
|
||||||
Matrix& _Mat;
|
Matrix& _Mat;
|
||||||
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
|
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
|
||||||
virtual RealD Mpc(const Field& in, Field& out) {
|
virtual void Mpc(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
tmp.Checkerboard() = !in.Checkerboard();
|
tmp.Checkerboard() = !in.Checkerboard();
|
||||||
|
|
||||||
@ -384,9 +383,9 @@ public:
|
|||||||
|
|
||||||
_Mat.Mooee(in, out);
|
_Mat.Mooee(in, out);
|
||||||
|
|
||||||
return axpy_norm(out, -1.0, tmp, out);
|
axpy(out, -1.0, tmp, out);
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag(const Field& in, Field& out) {
|
virtual void MpcDag(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.MeooeDag(in, tmp);
|
_Mat.MeooeDag(in, tmp);
|
||||||
@ -395,7 +394,7 @@ public:
|
|||||||
|
|
||||||
_Mat.MooeeDag(in, out);
|
_Mat.MooeeDag(in, out);
|
||||||
|
|
||||||
return axpy_norm(out, -1.0, tmp, out);
|
axpy(out, -1.0, tmp, out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -407,7 +406,7 @@ public:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
|
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
|
||||||
virtual RealD Mpc(const Field& in, Field& out) {
|
virtual void Mpc(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.Meooe(in, out);
|
_Mat.Meooe(in, out);
|
||||||
@ -415,9 +414,9 @@ public:
|
|||||||
_Mat.Meooe(tmp, out);
|
_Mat.Meooe(tmp, out);
|
||||||
_Mat.MooeeInv(out, tmp);
|
_Mat.MooeeInv(out, tmp);
|
||||||
|
|
||||||
return axpy_norm(out, -1.0, tmp, in);
|
axpy(out, -1.0, tmp, in);
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag(const Field& in, Field& out) {
|
virtual void MpcDag(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.MooeeInvDag(in, out);
|
_Mat.MooeeInvDag(in, out);
|
||||||
@ -425,7 +424,7 @@ public:
|
|||||||
_Mat.MooeeInvDag(tmp, out);
|
_Mat.MooeeInvDag(tmp, out);
|
||||||
_Mat.MeooeDag(out, tmp);
|
_Mat.MeooeDag(out, tmp);
|
||||||
|
|
||||||
return axpy_norm(out, -1.0, tmp, in);
|
axpy(out, -1.0, tmp, in);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -438,7 +437,7 @@ public:
|
|||||||
public:
|
public:
|
||||||
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
|
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
|
||||||
|
|
||||||
virtual RealD Mpc(const Field& in, Field& out) {
|
virtual void Mpc(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.MooeeInv(in, out);
|
_Mat.MooeeInv(in, out);
|
||||||
@ -446,9 +445,9 @@ public:
|
|||||||
_Mat.MooeeInv(tmp, out);
|
_Mat.MooeeInv(tmp, out);
|
||||||
_Mat.Meooe(out, tmp);
|
_Mat.Meooe(out, tmp);
|
||||||
|
|
||||||
return axpy_norm(out, -1.0, tmp, in);
|
axpy(out, -1.0, tmp, in);
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag(const Field& in, Field& out) {
|
virtual void MpcDag(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
|
|
||||||
_Mat.MeooeDag(in, out);
|
_Mat.MeooeDag(in, out);
|
||||||
@ -456,7 +455,7 @@ public:
|
|||||||
_Mat.MeooeDag(tmp, out);
|
_Mat.MeooeDag(tmp, out);
|
||||||
_Mat.MooeeInvDag(out, tmp);
|
_Mat.MooeeInvDag(out, tmp);
|
||||||
|
|
||||||
return axpy_norm(out, -1.0, tmp, in);
|
axpy(out, -1.0, tmp, in);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -475,71 +474,38 @@ public:
|
|||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
Field tmp;
|
Field tmp;
|
||||||
RealD mass;
|
RealD mass;
|
||||||
double tMpc;
|
|
||||||
double tIP;
|
|
||||||
double tMeo;
|
|
||||||
double taxpby_norm;
|
|
||||||
uint64_t ncall;
|
|
||||||
public:
|
public:
|
||||||
void Report(void)
|
|
||||||
{
|
|
||||||
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
|
|
||||||
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
|
|
||||||
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
|
|
||||||
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
|
|
||||||
}
|
|
||||||
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
|
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
|
||||||
{
|
{
|
||||||
assert( _Mat.isTrivialEE() );
|
assert( _Mat.isTrivialEE() );
|
||||||
mass = _Mat.Mass();
|
mass = _Mat.Mass();
|
||||||
tMpc=0;
|
|
||||||
tIP =0;
|
|
||||||
tMeo=0;
|
|
||||||
taxpby_norm=0;
|
|
||||||
ncall=0;
|
|
||||||
}
|
}
|
||||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||||
ncall++;
|
Mpc(in,out);
|
||||||
tMpc-=usecond();
|
|
||||||
n2 = Mpc(in,out);
|
|
||||||
tMpc+=usecond();
|
|
||||||
tIP-=usecond();
|
|
||||||
ComplexD dot= innerProduct(in,out);
|
ComplexD dot= innerProduct(in,out);
|
||||||
tIP+=usecond();
|
|
||||||
n1 = real(dot);
|
n1 = real(dot);
|
||||||
|
n2 =0.0;
|
||||||
}
|
}
|
||||||
virtual void HermOp(const Field &in, Field &out){
|
virtual void HermOp(const Field &in, Field &out){
|
||||||
ncall++;
|
Mpc(in,out);
|
||||||
tMpc-=usecond();
|
// _Mat.Meooe(in,out);
|
||||||
_Mat.Meooe(in,out);
|
// _Mat.Meooe(out,tmp);
|
||||||
_Mat.Meooe(out,tmp);
|
// axpby(out,-1.0,mass*mass,tmp,in);
|
||||||
tMpc+=usecond();
|
|
||||||
taxpby_norm-=usecond();
|
|
||||||
axpby(out,-1.0,mass*mass,tmp,in);
|
|
||||||
taxpby_norm+=usecond();
|
|
||||||
}
|
}
|
||||||
virtual RealD Mpc (const Field &in, Field &out)
|
virtual void Mpc (const Field &in, Field &out)
|
||||||
{
|
{
|
||||||
|
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
Field tmp2(in.Grid());
|
Field tmp2(in.Grid());
|
||||||
|
|
||||||
// std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
|
// _Mat.Mooee(in,out);
|
||||||
_Mat.Mooee(in,out);
|
// _Mat.Mooee(out,tmp);
|
||||||
_Mat.Mooee(out,tmp);
|
|
||||||
// std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
|
|
||||||
|
|
||||||
tMeo-=usecond();
|
|
||||||
_Mat.Meooe(in,out);
|
_Mat.Meooe(in,out);
|
||||||
_Mat.Meooe(out,tmp);
|
_Mat.Meooe(out,tmp);
|
||||||
tMeo+=usecond();
|
axpby(out,-1.0,mass*mass,tmp,in);
|
||||||
taxpby_norm-=usecond();
|
|
||||||
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
|
|
||||||
taxpby_norm+=usecond();
|
|
||||||
return nn;
|
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag (const Field &in, Field &out){
|
virtual void MpcDag (const Field &in, Field &out){
|
||||||
return Mpc(in,out);
|
Mpc(in,out);
|
||||||
}
|
}
|
||||||
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
|
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
|
||||||
assert(0);// Never need with staggered
|
assert(0);// Never need with staggered
|
||||||
@ -547,7 +513,6 @@ public:
|
|||||||
};
|
};
|
||||||
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
|
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
// Base classes for functions of operators
|
// Base classes for functions of operators
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
|
@ -38,16 +38,12 @@ template<class Field> class SparseMatrixBase {
|
|||||||
public:
|
public:
|
||||||
virtual GridBase *Grid(void) =0;
|
virtual GridBase *Grid(void) =0;
|
||||||
// Full checkerboar operations
|
// Full checkerboar operations
|
||||||
virtual RealD M (const Field &in, Field &out)=0;
|
virtual void M (const Field &in, Field &out)=0;
|
||||||
virtual RealD Mdag (const Field &in, Field &out)=0;
|
virtual void Mdag (const Field &in, Field &out)=0;
|
||||||
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
|
|
||||||
Field tmp (in.Grid());
|
|
||||||
ni=M(in,tmp);
|
|
||||||
no=Mdag(tmp,out);
|
|
||||||
}
|
|
||||||
virtual void MdagM(const Field &in, Field &out) {
|
virtual void MdagM(const Field &in, Field &out) {
|
||||||
RealD ni, no;
|
Field tmp (in.Grid());
|
||||||
MdagM(in,out,ni,no);
|
M(in,tmp);
|
||||||
|
Mdag(tmp,out);
|
||||||
}
|
}
|
||||||
virtual void Mdiag (const Field &in, Field &out)=0;
|
virtual void Mdiag (const Field &in, Field &out)=0;
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
||||||
|
@ -234,10 +234,8 @@ public:
|
|||||||
|
|
||||||
GridBase *grid=in.Grid();
|
GridBase *grid=in.Grid();
|
||||||
|
|
||||||
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
|
|
||||||
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
|
|
||||||
|
|
||||||
int vol=grid->gSites();
|
int vol=grid->gSites();
|
||||||
|
typedef typename Field::vector_type vector_type;
|
||||||
|
|
||||||
Field T0(grid); T0 = in;
|
Field T0(grid); T0 = in;
|
||||||
Field T1(grid);
|
Field T1(grid);
|
||||||
@ -260,12 +258,26 @@ public:
|
|||||||
for(int n=2;n<order;n++){
|
for(int n=2;n<order;n++){
|
||||||
|
|
||||||
Linop.HermOp(*Tn,y);
|
Linop.HermOp(*Tn,y);
|
||||||
// y=xscale*y+mscale*(*Tn);
|
#if 0
|
||||||
// *Tnp=2.0*y-(*Tnm);
|
auto y_v = y.View();
|
||||||
// out=out+Coeffs[n]* (*Tnp);
|
auto Tn_v = Tn->View();
|
||||||
|
auto Tnp_v = Tnp->View();
|
||||||
|
auto Tnm_v = Tnm->View();
|
||||||
|
constexpr int Nsimd = vector_type::Nsimd();
|
||||||
|
accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
|
||||||
|
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
||||||
|
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
|
||||||
|
});
|
||||||
|
if ( Coeffs[n] != 0.0) {
|
||||||
|
axpy(out,Coeffs[n],*Tnp,out);
|
||||||
|
}
|
||||||
|
#else
|
||||||
axpby(y,xscale,mscale,y,(*Tn));
|
axpby(y,xscale,mscale,y,(*Tn));
|
||||||
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
|
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
|
||||||
|
if ( Coeffs[n] != 0.0) {
|
||||||
axpy(out,Coeffs[n],*Tnp,out);
|
axpy(out,Coeffs[n],*Tnp,out);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// Cycle pointers to avoid copies
|
// Cycle pointers to avoid copies
|
||||||
Field *swizzle = Tnm;
|
Field *swizzle = Tnm;
|
||||||
Tnm =Tn;
|
Tnm =Tn;
|
||||||
|
@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
bo = beta * omega;
|
bo = beta * omega;
|
||||||
auto p_v = p.View();
|
{
|
||||||
auto r_v = r.View();
|
autoView( p_v , p, AcceleratorWrite);
|
||||||
auto v_v = v.View();
|
autoView( r_v , r, AcceleratorRead);
|
||||||
|
autoView( v_v , v, AcceleratorRead);
|
||||||
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
|
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
|
||||||
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
|
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
|
||||||
});
|
});
|
||||||
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
alpha = rho / Calpha.real();
|
alpha = rho / Calpha.real();
|
||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
auto h_v = h.View();
|
{
|
||||||
auto psi_v = psi.View();
|
autoView( p_v , p, AcceleratorRead);
|
||||||
|
autoView( r_v , r, AcceleratorRead);
|
||||||
|
autoView( v_v , v, AcceleratorRead);
|
||||||
|
autoView( psi_v,psi, AcceleratorRead);
|
||||||
|
autoView( h_v , h, AcceleratorWrite);
|
||||||
|
autoView( s_v , s, AcceleratorWrite);
|
||||||
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
|
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
|
||||||
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
|
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
|
||||||
});
|
});
|
||||||
|
|
||||||
auto s_v = s.View();
|
|
||||||
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
|
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
|
||||||
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
|
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
|
||||||
});
|
});
|
||||||
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
@ -166,11 +172,17 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
omega = Comega.real() / norm2(t);
|
omega = Comega.real() / norm2(t);
|
||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
auto t_v = t.View();
|
{
|
||||||
|
autoView( psi_v,psi, AcceleratorWrite);
|
||||||
|
autoView( r_v , r, AcceleratorWrite);
|
||||||
|
autoView( h_v , h, AcceleratorRead);
|
||||||
|
autoView( s_v , s, AcceleratorRead);
|
||||||
|
autoView( t_v , t, AcceleratorRead);
|
||||||
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
|
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
|
||||||
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
|
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
|
||||||
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
|
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
|
||||||
});
|
});
|
||||||
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
|
|
||||||
cp = norm2(r);
|
cp = norm2(r);
|
||||||
|
@ -140,13 +140,15 @@ public:
|
|||||||
b = cp / c;
|
b = cp / c;
|
||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
auto psi_v = psi.View();
|
{
|
||||||
auto p_v = p.View();
|
autoView( psi_v , psi, AcceleratorWrite);
|
||||||
auto r_v = r.View();
|
autoView( p_v , p, AcceleratorWrite);
|
||||||
|
autoView( r_v , r, AcceleratorWrite);
|
||||||
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
|
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
|
||||||
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
|
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
|
||||||
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
|
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
|
||||||
});
|
});
|
||||||
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
@ -37,211 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// Move following 100 LOC to lattice/Lattice_basis.h
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
template<class Field>
|
|
||||||
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
|
|
||||||
{
|
|
||||||
// If assume basis[j] are already orthonormal,
|
|
||||||
// can take all inner products in parallel saving 2x bandwidth
|
|
||||||
// Save 3x bandwidth on the second line of loop.
|
|
||||||
// perhaps 2.5x speed up.
|
|
||||||
// 2x overall in Multigrid Lanczos
|
|
||||||
for(int j=0; j<k; ++j){
|
|
||||||
auto ip = innerProduct(basis[j],w);
|
|
||||||
w = w - ip*basis[j];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|
||||||
{
|
|
||||||
typedef decltype(basis[0].View()) View;
|
|
||||||
auto tmp_v = basis[0].View();
|
|
||||||
Vector<View> basis_v(basis.size(),tmp_v);
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
GridBase* grid = basis[0].Grid();
|
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++){
|
|
||||||
basis_v[k] = basis[k].View();
|
|
||||||
}
|
|
||||||
#if 0
|
|
||||||
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
|
|
||||||
thread_region
|
|
||||||
{
|
|
||||||
vobj* B = Bt.data() + Nm * thread_num();
|
|
||||||
|
|
||||||
thread_for_in_region(ss, grid->oSites(),{
|
|
||||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
|
||||||
|
|
||||||
for(int j=j0; j<j1; ++j){
|
|
||||||
for(int k=k0; k<k1; ++k){
|
|
||||||
B[j] +=Qt(j,k) * basis_v[k][ss];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int j=j0; j<j1; ++j){
|
|
||||||
basis_v[j][ss] = B[j];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
|
|
||||||
int nrot = j1-j0;
|
|
||||||
|
|
||||||
|
|
||||||
uint64_t oSites =grid->oSites();
|
|
||||||
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
|
||||||
|
|
||||||
// printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
|
|
||||||
|
|
||||||
Vector <vobj> Bt(siteBlock * nrot);
|
|
||||||
auto Bp=&Bt[0];
|
|
||||||
|
|
||||||
// GPU readable copy of Eigen matrix
|
|
||||||
Vector<double> Qt_jv(Nm*Nm);
|
|
||||||
double *Qt_p = & Qt_jv[0];
|
|
||||||
for(int k=0;k<Nm;++k){
|
|
||||||
for(int j=0;j<Nm;++j){
|
|
||||||
Qt_p[j*Nm+k]=Qt(j,k);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Block the loop to keep storage footprint down
|
|
||||||
vobj zz=Zero();
|
|
||||||
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
|
||||||
|
|
||||||
// remaining work in this block
|
|
||||||
int ssites=MIN(siteBlock,oSites-s);
|
|
||||||
|
|
||||||
// zero out the accumulators
|
|
||||||
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
|
|
||||||
auto z=coalescedRead(zz);
|
|
||||||
coalescedWrite(Bp[ss],z);
|
|
||||||
});
|
|
||||||
|
|
||||||
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
|
|
||||||
|
|
||||||
int j =sj%nrot;
|
|
||||||
int jj =j0+j;
|
|
||||||
int ss =sj/nrot;
|
|
||||||
int sss=ss+s;
|
|
||||||
|
|
||||||
for(int k=k0; k<k1; ++k){
|
|
||||||
auto tmp = coalescedRead(Bp[ss*nrot+j]);
|
|
||||||
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
|
|
||||||
int j =sj%nrot;
|
|
||||||
int jj =j0+j;
|
|
||||||
int ss =sj/nrot;
|
|
||||||
int sss=ss+s;
|
|
||||||
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract a single rotated vector
|
|
||||||
template<class Field>
|
|
||||||
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
|
|
||||||
{
|
|
||||||
typedef decltype(basis[0].View()) View;
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
GridBase* grid = basis[0].Grid();
|
|
||||||
|
|
||||||
result.Checkerboard() = basis[0].Checkerboard();
|
|
||||||
auto result_v=result.View();
|
|
||||||
Vector<View> basis_v(basis.size(),result_v);
|
|
||||||
for(int k=0;k<basis.size();k++){
|
|
||||||
basis_v[k] = basis[k].View();
|
|
||||||
}
|
|
||||||
vobj zz=Zero();
|
|
||||||
Vector<double> Qt_jv(Nm);
|
|
||||||
double * Qt_j = & Qt_jv[0];
|
|
||||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
|
||||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
|
||||||
auto B=coalescedRead(zz);
|
|
||||||
for(int k=k0; k<k1; ++k){
|
|
||||||
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
|
|
||||||
}
|
|
||||||
coalescedWrite(result_v[ss], B);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
|
|
||||||
{
|
|
||||||
int vlen = idx.size();
|
|
||||||
|
|
||||||
assert(vlen>=1);
|
|
||||||
assert(vlen<=sort_vals.size());
|
|
||||||
assert(vlen<=_v.size());
|
|
||||||
|
|
||||||
for (size_t i=0;i<vlen;i++) {
|
|
||||||
|
|
||||||
if (idx[i] != i) {
|
|
||||||
|
|
||||||
//////////////////////////////////////
|
|
||||||
// idx[i] is a table of desired sources giving a permutation.
|
|
||||||
// Swap v[i] with v[idx[i]].
|
|
||||||
// Find j>i for which _vnew[j] = _vold[i],
|
|
||||||
// track the move idx[j] => idx[i]
|
|
||||||
// track the move idx[i] => i
|
|
||||||
//////////////////////////////////////
|
|
||||||
size_t j;
|
|
||||||
for (j=i;j<idx.size();j++)
|
|
||||||
if (idx[j]==i)
|
|
||||||
break;
|
|
||||||
|
|
||||||
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
|
|
||||||
|
|
||||||
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
|
|
||||||
std::swap(sort_vals[i],sort_vals[idx[i]]);
|
|
||||||
|
|
||||||
idx[j] = idx[i];
|
|
||||||
idx[i] = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
|
|
||||||
{
|
|
||||||
std::vector<int> idx(sort_vals.size());
|
|
||||||
std::iota(idx.begin(), idx.end(), 0);
|
|
||||||
|
|
||||||
// sort indexes based on comparing values in v
|
|
||||||
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
|
|
||||||
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
|
|
||||||
});
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
|
|
||||||
{
|
|
||||||
std::vector<int> idx = basisSortGetIndex(sort_vals);
|
|
||||||
if (reverse)
|
|
||||||
std::reverse(idx.begin(), idx.end());
|
|
||||||
|
|
||||||
basisReorderInPlace(_v,sort_vals,idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
// PAB: faster to compute the inner products first then fuse loops.
|
|
||||||
// If performance critical can improve.
|
|
||||||
template<class Field>
|
|
||||||
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
|
|
||||||
result = Zero();
|
|
||||||
assert(_v.size()==eval.size());
|
|
||||||
int N = (int)_v.size();
|
|
||||||
for (int i=0;i<N;i++) {
|
|
||||||
Field& tmp = _v[i];
|
|
||||||
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
// Implicitly restarted lanczos
|
// Implicitly restarted lanczos
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
|
@ -0,0 +1,241 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef GRID_PREC_GCR_NON_HERM_H
|
||||||
|
#define GRID_PREC_GCR_NON_HERM_H
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//VPGCR Abe and Zhang, 2005.
|
||||||
|
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
|
||||||
|
//Computing and Information Volume 2, Number 2, Pages 147-161
|
||||||
|
//NB. Likely not original reference since they are focussing on a preconditioner variant.
|
||||||
|
// but VPGCR was nicely written up in their paper
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
RealD Tolerance;
|
||||||
|
Integer MaxIterations;
|
||||||
|
int verbose;
|
||||||
|
int mmax;
|
||||||
|
int nstep;
|
||||||
|
int steps;
|
||||||
|
int level;
|
||||||
|
GridStopWatch PrecTimer;
|
||||||
|
GridStopWatch MatTimer;
|
||||||
|
GridStopWatch LinalgTimer;
|
||||||
|
|
||||||
|
LinearFunction<Field> &Preconditioner;
|
||||||
|
LinearOperatorBase<Field> &Linop;
|
||||||
|
|
||||||
|
void Level(int lv) { level=lv; };
|
||||||
|
|
||||||
|
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
|
||||||
|
Tolerance(tol),
|
||||||
|
MaxIterations(maxit),
|
||||||
|
Linop(_Linop),
|
||||||
|
Preconditioner(Prec),
|
||||||
|
mmax(_mmax),
|
||||||
|
nstep(_nstep)
|
||||||
|
{
|
||||||
|
level=1;
|
||||||
|
verbose=1;
|
||||||
|
};
|
||||||
|
|
||||||
|
void operator() (const Field &src, Field &psi){
|
||||||
|
|
||||||
|
psi=Zero();
|
||||||
|
RealD cp, ssq,rsq;
|
||||||
|
ssq=norm2(src);
|
||||||
|
rsq=Tolerance*Tolerance*ssq;
|
||||||
|
|
||||||
|
Field r(src.Grid());
|
||||||
|
|
||||||
|
PrecTimer.Reset();
|
||||||
|
MatTimer.Reset();
|
||||||
|
LinalgTimer.Reset();
|
||||||
|
|
||||||
|
GridStopWatch SolverTimer;
|
||||||
|
SolverTimer.Start();
|
||||||
|
|
||||||
|
steps=0;
|
||||||
|
for(int k=0;k<MaxIterations;k++){
|
||||||
|
|
||||||
|
cp=GCRnStep(src,psi,rsq);
|
||||||
|
|
||||||
|
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
|
||||||
|
|
||||||
|
if(cp<rsq) {
|
||||||
|
|
||||||
|
SolverTimer.Stop();
|
||||||
|
|
||||||
|
Linop.Op(psi,r);
|
||||||
|
axpy(r,-1.0,src,r);
|
||||||
|
RealD tr = norm2(r);
|
||||||
|
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
|
||||||
|
<< " computed residual "<<sqrt(cp/ssq)
|
||||||
|
<< " true residual " <<sqrt(tr/ssq)
|
||||||
|
<< " target " <<Tolerance <<std::endl;
|
||||||
|
|
||||||
|
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
|
||||||
|
// assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
|
||||||
|
|
||||||
|
RealD cp;
|
||||||
|
ComplexD a, b, zAz;
|
||||||
|
RealD zAAz;
|
||||||
|
ComplexD rq;
|
||||||
|
|
||||||
|
GridBase *grid = src.Grid();
|
||||||
|
|
||||||
|
Field r(grid);
|
||||||
|
Field z(grid);
|
||||||
|
Field tmp(grid);
|
||||||
|
Field ttmp(grid);
|
||||||
|
Field Az(grid);
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// history for flexible orthog
|
||||||
|
////////////////////////////////
|
||||||
|
std::vector<Field> q(mmax,grid);
|
||||||
|
std::vector<Field> p(mmax,grid);
|
||||||
|
std::vector<RealD> qq(mmax);
|
||||||
|
|
||||||
|
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
|
||||||
|
|
||||||
|
//////////////////////////////////
|
||||||
|
// initial guess x0 is taken as nonzero.
|
||||||
|
// r0=src-A x0 = src
|
||||||
|
//////////////////////////////////
|
||||||
|
MatTimer.Start();
|
||||||
|
Linop.Op(psi,Az);
|
||||||
|
zAz = innerProduct(Az,psi);
|
||||||
|
zAAz= norm2(Az);
|
||||||
|
MatTimer.Stop();
|
||||||
|
|
||||||
|
|
||||||
|
LinalgTimer.Start();
|
||||||
|
r=src-Az;
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
|
||||||
|
|
||||||
|
/////////////////////
|
||||||
|
// p = Prec(r)
|
||||||
|
/////////////////////
|
||||||
|
|
||||||
|
PrecTimer.Start();
|
||||||
|
Preconditioner(r,z);
|
||||||
|
PrecTimer.Stop();
|
||||||
|
|
||||||
|
MatTimer.Start();
|
||||||
|
Linop.Op(z,Az);
|
||||||
|
MatTimer.Stop();
|
||||||
|
|
||||||
|
LinalgTimer.Start();
|
||||||
|
|
||||||
|
zAz = innerProduct(Az,psi);
|
||||||
|
zAAz= norm2(Az);
|
||||||
|
|
||||||
|
//p[0],q[0],qq[0]
|
||||||
|
p[0]= z;
|
||||||
|
q[0]= Az;
|
||||||
|
qq[0]= zAAz;
|
||||||
|
|
||||||
|
cp =norm2(r);
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
for(int k=0;k<nstep;k++){
|
||||||
|
|
||||||
|
steps++;
|
||||||
|
|
||||||
|
int kp = k+1;
|
||||||
|
int peri_k = k %mmax;
|
||||||
|
int peri_kp= kp%mmax;
|
||||||
|
|
||||||
|
LinalgTimer.Start();
|
||||||
|
rq= innerProduct(q[peri_k],r); // what if rAr not real?
|
||||||
|
a = rq/qq[peri_k];
|
||||||
|
|
||||||
|
axpy(psi,a,p[peri_k],psi);
|
||||||
|
|
||||||
|
cp = axpy_norm(r,-a,q[peri_k],r);
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
|
||||||
|
|
||||||
|
if((k==nstep-1)||(cp<rsq)){
|
||||||
|
return cp;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
PrecTimer.Start();
|
||||||
|
Preconditioner(r,z);// solve Az = r
|
||||||
|
PrecTimer.Stop();
|
||||||
|
|
||||||
|
MatTimer.Start();
|
||||||
|
Linop.Op(z,Az);
|
||||||
|
MatTimer.Stop();
|
||||||
|
zAz = innerProduct(Az,psi);
|
||||||
|
zAAz= norm2(Az);
|
||||||
|
|
||||||
|
LinalgTimer.Start();
|
||||||
|
|
||||||
|
q[peri_kp]=Az;
|
||||||
|
p[peri_kp]=z;
|
||||||
|
|
||||||
|
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
|
||||||
|
for(int back=0;back<northog;back++){
|
||||||
|
|
||||||
|
int peri_back=(k-back)%mmax; assert((k-back)>=0);
|
||||||
|
|
||||||
|
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
|
||||||
|
p[peri_kp]=p[peri_kp]+b*p[peri_back];
|
||||||
|
q[peri_kp]=q[peri_kp]+b*q[peri_back];
|
||||||
|
|
||||||
|
}
|
||||||
|
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
}
|
||||||
|
assert(0); // never reached
|
||||||
|
return cp;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
#endif
|
@ -6,72 +6,6 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
MemoryStats *MemoryProfiler::stats = nullptr;
|
||||||
bool MemoryProfiler::debug = false;
|
bool MemoryProfiler::debug = false;
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
|
||||||
#define SMALL_LIMIT (0)
|
|
||||||
#else
|
|
||||||
#define SMALL_LIMIT (4096)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef POINTER_CACHE
|
|
||||||
int PointerCache::victim;
|
|
||||||
|
|
||||||
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
|
|
||||||
|
|
||||||
void *PointerCache::Insert(void *ptr,size_t bytes) {
|
|
||||||
|
|
||||||
if (bytes < SMALL_LIMIT ) return ptr;
|
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
assert(omp_in_parallel()==0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void * ret = NULL;
|
|
||||||
int v = -1;
|
|
||||||
|
|
||||||
for(int e=0;e<Ncache;e++) {
|
|
||||||
if ( Entries[e].valid==0 ) {
|
|
||||||
v=e;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( v==-1 ) {
|
|
||||||
v=victim;
|
|
||||||
victim = (victim+1)%Ncache;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Entries[v].valid ) {
|
|
||||||
ret = Entries[v].address;
|
|
||||||
Entries[v].valid = 0;
|
|
||||||
Entries[v].address = NULL;
|
|
||||||
Entries[v].bytes = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
Entries[v].address=ptr;
|
|
||||||
Entries[v].bytes =bytes;
|
|
||||||
Entries[v].valid =1;
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
void *PointerCache::Lookup(size_t bytes) {
|
|
||||||
|
|
||||||
if (bytes < SMALL_LIMIT ) return NULL;
|
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
|
||||||
assert(omp_in_parallel()==0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for(int e=0;e<Ncache;e++){
|
|
||||||
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
|
|
||||||
Entries[e].valid = 0;
|
|
||||||
return Entries[e].address;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||||
{
|
{
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
|
@ -26,118 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#ifndef GRID_ALIGNED_ALLOCATOR_H
|
#pragma once
|
||||||
#define GRID_ALIGNED_ALLOCATOR_H
|
|
||||||
|
|
||||||
#ifdef HAVE_MALLOC_MALLOC_H
|
|
||||||
#include <malloc/malloc.h>
|
|
||||||
#endif
|
|
||||||
#ifdef HAVE_MALLOC_H
|
|
||||||
#include <malloc.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
|
||||||
#include <mm_malloc.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define POINTER_CACHE
|
|
||||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
// Move control to configure.ac and Config.h?
|
|
||||||
#ifdef POINTER_CACHE
|
|
||||||
class PointerCache {
|
|
||||||
private:
|
|
||||||
/*Pinning pages is costly*/
|
|
||||||
/*Could maintain separate large and small allocation caches*/
|
|
||||||
#ifdef GRID_NVCC
|
|
||||||
static const int Ncache=128;
|
|
||||||
#else
|
|
||||||
static const int Ncache=8;
|
|
||||||
#endif
|
|
||||||
static int victim;
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
void *address;
|
|
||||||
size_t bytes;
|
|
||||||
int valid;
|
|
||||||
} PointerCacheEntry;
|
|
||||||
|
|
||||||
static PointerCacheEntry Entries[Ncache];
|
|
||||||
|
|
||||||
public:
|
|
||||||
|
|
||||||
static void *Insert(void *ptr,size_t bytes) ;
|
|
||||||
static void *Lookup(size_t bytes) ;
|
|
||||||
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
std::string sizeString(size_t bytes);
|
|
||||||
|
|
||||||
struct MemoryStats
|
|
||||||
{
|
|
||||||
size_t totalAllocated{0}, maxAllocated{0},
|
|
||||||
currentlyAllocated{0}, totalFreed{0};
|
|
||||||
};
|
|
||||||
|
|
||||||
class MemoryProfiler
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
static MemoryStats *stats;
|
|
||||||
static bool debug;
|
|
||||||
};
|
|
||||||
|
|
||||||
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
|
|
||||||
#define profilerDebugPrint \
|
|
||||||
if (MemoryProfiler::stats) \
|
|
||||||
{ \
|
|
||||||
auto s = MemoryProfiler::stats; \
|
|
||||||
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
|
|
||||||
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
|
|
||||||
<< std::endl; \
|
|
||||||
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
|
|
||||||
<< std::endl; \
|
|
||||||
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
|
|
||||||
<< std::endl; \
|
|
||||||
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
|
|
||||||
<< std::endl; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define profilerAllocate(bytes) \
|
|
||||||
if (MemoryProfiler::stats) \
|
|
||||||
{ \
|
|
||||||
auto s = MemoryProfiler::stats; \
|
|
||||||
s->totalAllocated += (bytes); \
|
|
||||||
s->currentlyAllocated += (bytes); \
|
|
||||||
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
|
|
||||||
} \
|
|
||||||
if (MemoryProfiler::debug) \
|
|
||||||
{ \
|
|
||||||
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
|
|
||||||
profilerDebugPrint; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define profilerFree(bytes) \
|
|
||||||
if (MemoryProfiler::stats) \
|
|
||||||
{ \
|
|
||||||
auto s = MemoryProfiler::stats; \
|
|
||||||
s->totalFreed += (bytes); \
|
|
||||||
s->currentlyAllocated -= (bytes); \
|
|
||||||
} \
|
|
||||||
if (MemoryProfiler::debug) \
|
|
||||||
{ \
|
|
||||||
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
|
|
||||||
profilerDebugPrint; \
|
|
||||||
}
|
|
||||||
|
|
||||||
void check_huge_pages(void *Buf,uint64_t BYTES);
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////
|
|
||||||
// A lattice of something, but assume the something is SIMDized.
|
|
||||||
////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
template<typename _Tp>
|
template<typename _Tp>
|
||||||
class alignedAllocator {
|
class alignedAllocator {
|
||||||
public:
|
public:
|
||||||
@ -161,70 +53,60 @@ public:
|
|||||||
{
|
{
|
||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
profilerAllocate(bytes);
|
profilerAllocate(bytes);
|
||||||
|
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
|
||||||
|
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||||
#ifdef POINTER_CACHE
|
|
||||||
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
|
|
||||||
#else
|
|
||||||
pointer ptr = nullptr;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
|
||||||
////////////////////////////////////
|
|
||||||
// Unified (managed) memory
|
|
||||||
////////////////////////////////////
|
|
||||||
if ( ptr == (_Tp *) NULL ) {
|
|
||||||
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
|
|
||||||
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
|
||||||
if( err != cudaSuccess ) {
|
|
||||||
ptr = (_Tp *) NULL;
|
|
||||||
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assert( ptr != (_Tp *)NULL);
|
|
||||||
#else
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// 2MB align; could make option probably doesn't need configurability
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
|
||||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
|
|
||||||
#else
|
|
||||||
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
|
|
||||||
#endif
|
|
||||||
assert( ptr != (_Tp *)NULL);
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////
|
|
||||||
// First touch optimise in threaded loop
|
|
||||||
//////////////////////////////////////////////////
|
|
||||||
uint64_t *cp = (uint64_t *)ptr;
|
|
||||||
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
|
|
||||||
cp[n]=0;
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void deallocate(pointer __p, size_type __n) {
|
void deallocate(pointer __p, size_type __n)
|
||||||
|
{
|
||||||
size_type bytes = __n * sizeof(_Tp);
|
size_type bytes = __n * sizeof(_Tp);
|
||||||
|
|
||||||
profilerFree(bytes);
|
profilerFree(bytes);
|
||||||
|
MemoryManager::CpuFree((void *)__p,bytes);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef POINTER_CACHE
|
// FIXME: hack for the copy constructor, eventually it must be avoided
|
||||||
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
|
//void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
||||||
#else
|
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||||
pointer __freeme = __p;
|
void construct(pointer __p) { };
|
||||||
#endif
|
void destroy(pointer __p) { };
|
||||||
|
};
|
||||||
|
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
|
||||||
|
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
template<typename _Tp>
|
||||||
if ( __freeme ) cudaFree((void *)__freeme);
|
class uvmAllocator {
|
||||||
#else
|
public:
|
||||||
#ifdef HAVE_MM_MALLOC_H
|
typedef std::size_t size_type;
|
||||||
if ( __freeme ) _mm_free((void *)__freeme);
|
typedef std::ptrdiff_t difference_type;
|
||||||
#else
|
typedef _Tp* pointer;
|
||||||
if ( __freeme ) free((void *)__freeme);
|
typedef const _Tp* const_pointer;
|
||||||
#endif
|
typedef _Tp& reference;
|
||||||
#endif
|
typedef const _Tp& const_reference;
|
||||||
|
typedef _Tp value_type;
|
||||||
|
|
||||||
|
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
|
||||||
|
uvmAllocator() throw() { }
|
||||||
|
uvmAllocator(const uvmAllocator&) throw() { }
|
||||||
|
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
|
||||||
|
~uvmAllocator() throw() { }
|
||||||
|
pointer address(reference __x) const { return &__x; }
|
||||||
|
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
|
||||||
|
|
||||||
|
pointer allocate(size_type __n, const void* _p= 0)
|
||||||
|
{
|
||||||
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
|
profilerAllocate(bytes);
|
||||||
|
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
|
||||||
|
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void deallocate(pointer __p, size_type __n)
|
||||||
|
{
|
||||||
|
size_type bytes = __n * sizeof(_Tp);
|
||||||
|
profilerFree(bytes);
|
||||||
|
MemoryManager::SharedFree((void *)__p,bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: hack for the copy constructor, eventually it must be avoided
|
// FIXME: hack for the copy constructor, eventually it must be avoided
|
||||||
@ -233,17 +115,17 @@ public:
|
|||||||
void construct(pointer __p) { };
|
void construct(pointer __p) { };
|
||||||
void destroy(pointer __p) { };
|
void destroy(pointer __p) { };
|
||||||
};
|
};
|
||||||
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
|
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
|
||||||
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
|
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Template typedefs
|
// Template typedefs
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class T> using commAllocator = alignedAllocator<T>;
|
template<class T> using commAllocator = uvmAllocator<T>;
|
||||||
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
|
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
||||||
template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
|
template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
|
||||||
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
|
//template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
|
||||||
|
4
Grid/allocator/Allocator.h
Normal file
4
Grid/allocator/Allocator.h
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#pragma once
|
||||||
|
#include <Grid/allocator/MemoryStats.h>
|
||||||
|
#include <Grid/allocator/MemoryManager.h>
|
||||||
|
#include <Grid/allocator/AlignedAllocator.h>
|
244
Grid/allocator/MemoryManager.cc
Normal file
244
Grid/allocator/MemoryManager.cc
Normal file
@ -0,0 +1,244 @@
|
|||||||
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
/*Allocation types, saying which pointer cache should be used*/
|
||||||
|
#define Cpu (0)
|
||||||
|
#define CpuSmall (1)
|
||||||
|
#define Acc (2)
|
||||||
|
#define AccSmall (3)
|
||||||
|
#define Shared (4)
|
||||||
|
#define SharedSmall (5)
|
||||||
|
// Running byte counts of memory obtained from the shared, accelerator and
// host allocators respectively; adjusted by the MemoryManager allocate/free
// routines and printed by MemoryManager::PrintBytes().
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;
|
||||||
|
void MemoryManager::PrintBytes(void)
|
||||||
|
{
|
||||||
|
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
|
||||||
|
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
|
||||||
|
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Data tables for the recently freed pointer caches
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
|
||||||
|
int MemoryManager::Victim[MemoryManager::NallocType];
|
||||||
|
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Actual allocation and deallocation utils
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
void *MemoryManager::AcceleratorAllocate(size_t bytes)
|
||||||
|
{
|
||||||
|
void *ptr = (void *) Lookup(bytes,Acc);
|
||||||
|
if ( ptr == (void *) NULL ) {
|
||||||
|
ptr = (void *) acceleratorAllocDevice(bytes);
|
||||||
|
total_device+=bytes;
|
||||||
|
}
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
|
||||||
|
{
|
||||||
|
void *__freeme = Insert(ptr,bytes,Acc);
|
||||||
|
if ( __freeme ) {
|
||||||
|
acceleratorFreeDevice(__freeme);
|
||||||
|
total_device-=bytes;
|
||||||
|
// PrintBytes();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void *MemoryManager::SharedAllocate(size_t bytes)
|
||||||
|
{
|
||||||
|
void *ptr = (void *) Lookup(bytes,Shared);
|
||||||
|
if ( ptr == (void *) NULL ) {
|
||||||
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
|
total_shared+=bytes;
|
||||||
|
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
||||||
|
// PrintBytes();
|
||||||
|
}
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
void MemoryManager::SharedFree (void *ptr,size_t bytes)
|
||||||
|
{
|
||||||
|
void *__freeme = Insert(ptr,bytes,Shared);
|
||||||
|
if ( __freeme ) {
|
||||||
|
acceleratorFreeShared(__freeme);
|
||||||
|
total_shared-=bytes;
|
||||||
|
// PrintBytes();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#ifdef GRID_UVM
|
||||||
|
void *MemoryManager::CpuAllocate(size_t bytes)
|
||||||
|
{
|
||||||
|
void *ptr = (void *) Lookup(bytes,Cpu);
|
||||||
|
if ( ptr == (void *) NULL ) {
|
||||||
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
|
total_host+=bytes;
|
||||||
|
}
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
||||||
|
{
|
||||||
|
NotifyDeletion(_ptr);
|
||||||
|
void *__freeme = Insert(_ptr,bytes,Cpu);
|
||||||
|
if ( __freeme ) {
|
||||||
|
acceleratorFreeShared(__freeme);
|
||||||
|
total_host-=bytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
void *MemoryManager::CpuAllocate(size_t bytes)
|
||||||
|
{
|
||||||
|
void *ptr = (void *) Lookup(bytes,Cpu);
|
||||||
|
if ( ptr == (void *) NULL ) {
|
||||||
|
ptr = (void *) acceleratorAllocCpu(bytes);
|
||||||
|
total_host+=bytes;
|
||||||
|
}
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
||||||
|
{
|
||||||
|
NotifyDeletion(_ptr);
|
||||||
|
void *__freeme = Insert(_ptr,bytes,Cpu);
|
||||||
|
if ( __freeme ) {
|
||||||
|
acceleratorFreeCpu(__freeme);
|
||||||
|
total_host-=bytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//////////////////////////////////////////
|
||||||
|
// call only once
|
||||||
|
//////////////////////////////////////////
|
||||||
|
void MemoryManager::Init(void)
|
||||||
|
{
|
||||||
|
|
||||||
|
char * str;
|
||||||
|
int Nc;
|
||||||
|
int NcS;
|
||||||
|
|
||||||
|
str= getenv("GRID_ALLOC_NCACHE_LARGE");
|
||||||
|
if ( str ) {
|
||||||
|
Nc = atoi(str);
|
||||||
|
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
||||||
|
Ncache[Cpu]=Nc;
|
||||||
|
Ncache[Acc]=Nc;
|
||||||
|
Ncache[Shared]=Nc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
||||||
|
if ( str ) {
|
||||||
|
Nc = atoi(str);
|
||||||
|
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
||||||
|
Ncache[CpuSmall]=Nc;
|
||||||
|
Ncache[AccSmall]=Nc;
|
||||||
|
Ncache[SharedSmall]=Nc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
|
||||||
|
#ifdef ALLOCATION_CACHE
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_UVM
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Offers a freed allocation to the pool selected by <type> and size class.
// Returns whatever the pool did not keep; with the cache compiled out that is
// always <ptr> itself.
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  // Allocations under GRID_ALLOC_SMALL_LIMIT use the pool at index type+1.
  const int pool = (bytes < GRID_ALLOC_SMALL_LIMIT) ? (type + 1) : type;
  return Insert(ptr,bytes,Entries[pool],Ncache[pool],Victim[pool]);
#else
  return ptr;
#endif
}
|
||||||
|
|
||||||
|
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
|
||||||
|
{
|
||||||
|
assert(ncache>0);
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
assert(omp_in_parallel()==0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void * ret = NULL;
|
||||||
|
int v = -1;
|
||||||
|
|
||||||
|
for(int e=0;e<ncache;e++) {
|
||||||
|
if ( entries[e].valid==0 ) {
|
||||||
|
v=e;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( v==-1 ) {
|
||||||
|
v=victim;
|
||||||
|
victim = (victim+1)%ncache;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( entries[v].valid ) {
|
||||||
|
ret = entries[v].address;
|
||||||
|
entries[v].valid = 0;
|
||||||
|
entries[v].address = NULL;
|
||||||
|
entries[v].bytes = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
entries[v].address=ptr;
|
||||||
|
entries[v].bytes =bytes;
|
||||||
|
entries[v].valid =1;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tries to satisfy a request for <bytes> from the pool selected by <type> and
// size class. Returns NULL on a miss, and always when the cache is compiled out.
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  // Allocations under GRID_ALLOC_SMALL_LIMIT use the pool at index type+1.
  const int pool = (bytes < GRID_ALLOC_SMALL_LIMIT) ? (type + 1) : type;
  return Lookup(bytes,Entries[pool],Ncache[pool]);
#else
  return NULL;
#endif
}
|
||||||
|
|
||||||
|
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
|
||||||
|
{
|
||||||
|
assert(ncache>0);
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
assert(omp_in_parallel()==0);
|
||||||
|
#endif
|
||||||
|
for(int e=0;e<ncache;e++){
|
||||||
|
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
||||||
|
entries[e].valid = 0;
|
||||||
|
return entries[e].address;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
181
Grid/allocator/MemoryManager.h
Normal file
181
Grid/allocator/MemoryManager.h
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/MemoryManager.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#pragma once
|
||||||
|
#include <list>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
// Move control to configure.ac and Config.h?
|
||||||
|
|
||||||
|
/* Compile the recently-freed allocation cache into Insert()/Lookup(). */
#define ALLOCATION_CACHE
/* Allocation alignment: 2 MiB. */
#define GRID_ALLOC_ALIGN (2*1024*1024)
/* Requests below this many bytes use the "small" cache pools. */
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
||||||
|
|
||||||
|
/*Pinning pages is costly*/
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Advise the LatticeAccelerator class
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
enum ViewAdvise {
  // Regular data
  AdviseDefault       = 0,

  // Advise that the data is used infrequently. This can
  // significantly influence performance of bulk storage.
  AdviseInfrequentUse = 1

  // Not enabled:
  // AdviseTransient      = 0x2,   // Data will mostly be read.  On some architectures
  //                               // enables read-only copies of memory to be kept on
  //                               // host and device.

  // AdviseAcceleratorWriteDiscard = 0x4  // Field will be written in entirety on device
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// View Access Mode
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
enum ViewMode {
  // One bit per access mode.
  AcceleratorRead         = 1 << 0,
  AcceleratorWrite        = 1 << 1,
  AcceleratorWriteDiscard = 1 << 2,
  CpuRead                 = 1 << 3,
  CpuWrite                = 1 << 4,
  CpuWriteDiscard         = CpuWrite  // same for now
};
|
||||||
|
|
||||||
|
class MemoryManager {
|
||||||
|
private:
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// For caching recently freed allocations
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
typedef struct {
|
||||||
|
void *address;
|
||||||
|
size_t bytes;
|
||||||
|
int valid;
|
||||||
|
} AllocationCacheEntry;
|
||||||
|
|
||||||
|
static const int NallocCacheMax=128;
|
||||||
|
static const int NallocType=6;
|
||||||
|
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
|
||||||
|
static int Victim[NallocType];
|
||||||
|
static int Ncache[NallocType];
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// Free pool
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
static void *Insert(void *ptr,size_t bytes,int type) ;
|
||||||
|
static void *Lookup(size_t bytes,int type) ;
|
||||||
|
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
||||||
|
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
||||||
|
|
||||||
|
static void *AcceleratorAllocate(size_t bytes);
|
||||||
|
static void AcceleratorFree (void *ptr,size_t bytes);
|
||||||
|
static void PrintBytes(void);
|
||||||
|
public:
|
||||||
|
static void Init(void);
|
||||||
|
static void *SharedAllocate(size_t bytes);
|
||||||
|
static void SharedFree (void *ptr,size_t bytes);
|
||||||
|
static void *CpuAllocate(size_t bytes);
|
||||||
|
static void CpuFree (void *ptr,size_t bytes);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
// Footprint tracking
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
static uint64_t DeviceBytes;
|
||||||
|
static uint64_t DeviceLRUBytes;
|
||||||
|
static uint64_t DeviceMaxBytes;
|
||||||
|
static uint64_t HostToDeviceBytes;
|
||||||
|
static uint64_t DeviceToHostBytes;
|
||||||
|
static uint64_t HostToDeviceXfer;
|
||||||
|
static uint64_t DeviceToHostXfer;
|
||||||
|
|
||||||
|
private:
|
||||||
|
#ifndef GRID_UVM
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// Data tables for ViewCache
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
typedef std::list<uint64_t> LRU_t;
|
||||||
|
typedef typename LRU_t::iterator LRUiterator;
|
||||||
|
typedef struct {
|
||||||
|
int LRU_valid;
|
||||||
|
LRUiterator LRU_entry;
|
||||||
|
uint64_t CpuPtr;
|
||||||
|
uint64_t AccPtr;
|
||||||
|
size_t bytes;
|
||||||
|
uint32_t transient;
|
||||||
|
uint32_t state;
|
||||||
|
uint32_t accLock;
|
||||||
|
uint32_t cpuLock;
|
||||||
|
} AcceleratorViewEntry;
|
||||||
|
|
||||||
|
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
|
||||||
|
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
|
||||||
|
|
||||||
|
static AccViewTable_t AccViewTable;
|
||||||
|
static LRU_t LRU;
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// Device motion
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||||
|
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
|
||||||
|
static void Evict(AcceleratorViewEntry &AccCache);
|
||||||
|
static void Flush(AcceleratorViewEntry &AccCache);
|
||||||
|
static void Clone(AcceleratorViewEntry &AccCache);
|
||||||
|
static void AccDiscard(AcceleratorViewEntry &AccCache);
|
||||||
|
static void CpuDiscard(AcceleratorViewEntry &AccCache);
|
||||||
|
|
||||||
|
// static void LRUupdate(AcceleratorViewEntry &AccCache);
|
||||||
|
static void LRUinsert(AcceleratorViewEntry &AccCache);
|
||||||
|
static void LRUremove(AcceleratorViewEntry &AccCache);
|
||||||
|
|
||||||
|
// manage entries in the table
|
||||||
|
static int EntryPresent(uint64_t CpuPtr);
|
||||||
|
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||||
|
static void EntryErase (uint64_t CpuPtr);
|
||||||
|
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
|
||||||
|
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
|
||||||
|
|
||||||
|
static void AcceleratorViewClose(uint64_t AccPtr);
|
||||||
|
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||||
|
static void CpuViewClose(uint64_t Ptr);
|
||||||
|
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||||
|
#endif
|
||||||
|
static void NotifyDeletion(void * CpuPtr);
|
||||||
|
|
||||||
|
public:
|
||||||
|
static void Print(void);
|
||||||
|
static int isOpen (void* CpuPtr);
|
||||||
|
static void ViewClose(void* CpuPtr,ViewMode mode);
|
||||||
|
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
468
Grid/allocator/MemoryManagerCache.cc
Normal file
468
Grid/allocator/MemoryManagerCache.cc
Normal file
@ -0,0 +1,468 @@
|
|||||||
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
|
#ifndef GRID_UVM
|
||||||
|
|
||||||
|
#warning "Using explicit device memory copies"
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
#define dprintf(...)
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// For caching copies of data on device
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
|
||||||
|
MemoryManager::LRU_t MemoryManager::LRU;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
// Footprint tracking
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
uint64_t MemoryManager::DeviceBytes;
|
||||||
|
uint64_t MemoryManager::DeviceLRUBytes;
|
||||||
|
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
|
||||||
|
uint64_t MemoryManager::HostToDeviceBytes;
|
||||||
|
uint64_t MemoryManager::DeviceToHostBytes;
|
||||||
|
uint64_t MemoryManager::HostToDeviceXfer;
|
||||||
|
uint64_t MemoryManager::DeviceToHostXfer;
|
||||||
|
|
||||||
|
////////////////////////////////////
|
||||||
|
// Priority ordering for unlocked entries
|
||||||
|
// Empty
|
||||||
|
// CpuDirty
|
||||||
|
// Consistent
|
||||||
|
// AccDirty
|
||||||
|
////////////////////////////////////
|
||||||
|
/* Values stored in AcceleratorViewEntry::state (and ::transient for EvictNext). */
#define Empty      (0x0) /* Entry unoccupied */
#define CpuDirty   (0x1) /* CPU copy is golden, Acc buffer MAY not be allocated */
#define Consistent (0x2) /* ACC copy AND CPU copy are valid */
#define AccDirty   (0x4) /* ACC copy is golden */
#define EvictNext  (0x8) /* Priority for eviction */
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// Mechanics of data table maintenance
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
int MemoryManager::EntryPresent(uint64_t CpuPtr)
|
||||||
|
{
|
||||||
|
if(AccViewTable.empty()) return 0;
|
||||||
|
|
||||||
|
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
||||||
|
{
|
||||||
|
assert(!EntryPresent(CpuPtr));
|
||||||
|
AcceleratorViewEntry AccCache;
|
||||||
|
AccCache.CpuPtr = CpuPtr;
|
||||||
|
AccCache.AccPtr = (uint64_t)NULL;
|
||||||
|
AccCache.bytes = bytes;
|
||||||
|
AccCache.state = CpuDirty;
|
||||||
|
AccCache.LRU_valid=0;
|
||||||
|
AccCache.transient=0;
|
||||||
|
AccCache.accLock=0;
|
||||||
|
AccCache.cpuLock=0;
|
||||||
|
AccViewTable[CpuPtr] = AccCache;
|
||||||
|
}
|
||||||
|
// Returns the table iterator for <CpuPtr>; the entry is asserted to exist.
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
  assert(EntryPresent(CpuPtr));
  auto position = AccViewTable.find(CpuPtr);
  assert(position!=AccViewTable.end());
  return position;
}
|
||||||
|
void MemoryManager::EntryErase(uint64_t CpuPtr)
|
||||||
|
{
|
||||||
|
auto AccCache = EntryLookup(CpuPtr);
|
||||||
|
AccViewTable.erase(CpuPtr);
|
||||||
|
}
|
||||||
|
// Adds an entry to the LRU list and counts its bytes as evictable.
// Transient entries go to the back of the list, all others to the front.
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.LRU_valid==0);

  // list::insert returns the iterator to the new node, which is remembered so
  // that LRUremove can erase it directly.
  LRUiterator where  = AccCache.transient ? LRU.end() : LRU.begin();
  AccCache.LRU_entry = LRU.insert(where,AccCache.CpuPtr);
  AccCache.LRU_valid = 1;

  DeviceLRUBytes+=AccCache.bytes;
}
|
||||||
|
// Takes an entry out of the LRU list and stops counting its bytes as evictable.
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.LRU_valid==1);

  DeviceLRUBytes-=AccCache.bytes;

  LRU.erase(AccCache.LRU_entry);
  AccCache.LRU_valid = 0;
}
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
// Accelerator cache motion & consistency logic
|
||||||
|
/////////////////////////////////////////////////
|
||||||
|
///////////////////////////////////////////////////////////
// Drop an entry without copying device data back:
// free the device buffer (if any) and erase the table entry.
// The entry must be unlocked; if it has a device buffer it
// must be in the LRU pool.
///////////////////////////////////////////////////////////
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool hasDeviceBuffer = (AccCache.AccPtr != (uint64_t)NULL);
  if ( hasDeviceBuffer ) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes -= AccCache.bytes;
    LRUremove(AccCache);
  }

  // Copy the key first: erasing the entry destroys what AccCache refers to.
  const uint64_t key = AccCache.CpuPtr;
  EntryErase(key);
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
// Evict an entry: bring the host copy up to date if the device copy is the
// golden one, free the device buffer (if any) and erase the table entry.
// The entry must be unlocked; if it has a device buffer it must be in the
// LRU pool.
///////////////////////////////////////////////////////////////////////////
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);

  const bool deviceIsGolden = (AccCache.state==AccDirty);
  if ( deviceIsGolden ) {
    Flush(AccCache);
  }

  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool hasDeviceBuffer = (AccCache.AccPtr != (uint64_t)NULL);
  if ( hasDeviceBuffer ) {
    AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
    DeviceBytes -= AccCache.bytes;
    LRUremove(AccCache);
  }

  // Copy the key first: erasing the entry destroys what AccCache refers to.
  const uint64_t key = AccCache.CpuPtr;
  EntryErase(key);
}
|
||||||
|
// Copies an AccDirty entry's device buffer to its host buffer and marks the
// entry Consistent. The entry must be unlocked and have both buffers.
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state==AccDirty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  void *deviceBuffer = (void *)AccCache.AccPtr;
  void *hostBuffer   = (void *)AccCache.CpuPtr;
  acceleratorCopyFromDevice(deviceBuffer,hostBuffer,AccCache.bytes);

  // Transfer statistics
  DeviceToHostXfer++;
  DeviceToHostBytes+=AccCache.bytes;

  AccCache.state=Consistent;
}
|
||||||
|
// Copies a CpuDirty entry's host buffer to the device, allocating the device
// buffer on first use, and marks the entry Consistent. The entry must be unlocked.
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state==CpuDirty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  // Deferred allocation of the device buffer.
  const bool needsDeviceBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if ( needsDeviceBuffer ) {
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }

  void *hostBuffer   = (void *)AccCache.CpuPtr;
  void *deviceBuffer = (void *)AccCache.AccPtr;
  acceleratorCopyToDevice(hostBuffer,deviceBuffer,AccCache.bytes);

  // Transfer statistics
  HostToDeviceXfer++;
  HostToDeviceBytes+=AccCache.bytes;

  AccCache.state=Consistent;
}
|
||||||
|
|
||||||
|
// Marks the entry AccDirty without copying any host data to the device; the
// device buffer is allocated if it does not exist yet. The entry must be unlocked.
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
  assert(AccCache.state!=Empty);
  assert(AccCache.cpuLock==0);
  assert(AccCache.accLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);

  const bool needsDeviceBuffer = (AccCache.AccPtr==(uint64_t)NULL);
  if ( needsDeviceBuffer ) {
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }

  AccCache.state=AccDirty;
}
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// View management
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
||||||
|
{
|
||||||
|
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||||
|
AcceleratorViewClose((uint64_t)Ptr);
|
||||||
|
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||||
|
CpuViewClose((uint64_t)Ptr);
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
||||||
|
{
|
||||||
|
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||||
|
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||||
|
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
||||||
|
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||||
|
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||||
|
{
|
||||||
|
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||||
|
if ( DeviceLRUBytes > 0){
|
||||||
|
assert(LRU.size()>0);
|
||||||
|
uint64_t victim = LRU.back();
|
||||||
|
auto AccCacheIterator = EntryLookup(victim);
|
||||||
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
Evict(AccCache);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
|
||||||
|
{
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Find if present, otherwise get or force an empty
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
if ( EntryPresent(CpuPtr)==0 ){
|
||||||
|
EvictVictims(bytes);
|
||||||
|
EntryCreate(CpuPtr,bytes,mode,hint);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
|
||||||
|
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
|
||||||
|
|
||||||
|
assert(AccCache.cpuLock==0); // Programming error
|
||||||
|
|
||||||
|
if(AccCache.state!=Empty) {
|
||||||
|
assert(AccCache.CpuPtr == CpuPtr);
|
||||||
|
assert(AccCache.bytes ==bytes);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* State transitions and actions
|
||||||
|
*
|
||||||
|
* Action State StateNext Flush Clone
|
||||||
|
*
|
||||||
|
* AccRead Empty Consistent - Y
|
||||||
|
* AccWrite Empty AccDirty - Y
|
||||||
|
* AccRead CpuDirty Consistent - Y
|
||||||
|
* AccWrite CpuDirty AccDirty - Y
|
||||||
|
* AccRead Consistent Consistent - -
|
||||||
|
* AccWrite Consistent AccDirty - -
|
||||||
|
* AccRead AccDirty AccDirty - -
|
||||||
|
* AccWrite AccDirty AccDirty - -
|
||||||
|
*/
|
||||||
|
if(AccCache.state==Empty) {
|
||||||
|
assert(AccCache.LRU_valid==0);
|
||||||
|
AccCache.CpuPtr = CpuPtr;
|
||||||
|
AccCache.AccPtr = (uint64_t)NULL;
|
||||||
|
AccCache.bytes = bytes;
|
||||||
|
AccCache.state = CpuDirty; // Cpu starts primary
|
||||||
|
if(mode==AcceleratorWriteDiscard){
|
||||||
|
CpuDiscard(AccCache);
|
||||||
|
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
|
||||||
|
} else if(mode==AcceleratorWrite){
|
||||||
|
Clone(AccCache);
|
||||||
|
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
|
||||||
|
} else {
|
||||||
|
Clone(AccCache);
|
||||||
|
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
||||||
|
}
|
||||||
|
AccCache.accLock= 1;
|
||||||
|
} else if(AccCache.state==CpuDirty ){
|
||||||
|
if(mode==AcceleratorWriteDiscard) {
|
||||||
|
CpuDiscard(AccCache);
|
||||||
|
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
|
||||||
|
} else if(mode==AcceleratorWrite) {
|
||||||
|
Clone(AccCache);
|
||||||
|
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
|
||||||
|
} else {
|
||||||
|
Clone(AccCache);
|
||||||
|
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||||
|
}
|
||||||
|
AccCache.accLock++;
|
||||||
|
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
|
||||||
|
} else if(AccCache.state==Consistent) {
|
||||||
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
|
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||||
|
else
|
||||||
|
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||||
|
AccCache.accLock++;
|
||||||
|
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
|
||||||
|
} else if(AccCache.state==AccDirty) {
|
||||||
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
|
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||||
|
else
|
||||||
|
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||||
|
AccCache.accLock++;
|
||||||
|
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If view is opened on device remove from LRU
|
||||||
|
if(AccCache.LRU_valid==1){
|
||||||
|
// must possibly remove from LRU as now locked on GPU
|
||||||
|
LRUremove(AccCache);
|
||||||
|
}
|
||||||
|
|
||||||
|
int transient =hint;
|
||||||
|
AccCache.transient= transient? EvictNext : 0;
|
||||||
|
|
||||||
|
return AccCache.AccPtr;
|
||||||
|
}
|
||||||
|
////////////////////////////////////
|
||||||
|
// look up & decrement lock count
|
||||||
|
////////////////////////////////////
|
||||||
|
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
||||||
|
{
|
||||||
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
|
||||||
|
assert(AccCache.cpuLock==0);
|
||||||
|
assert(AccCache.accLock>0);
|
||||||
|
|
||||||
|
AccCache.accLock--;
|
||||||
|
|
||||||
|
// Move to LRU queue if not locked and close on device
|
||||||
|
if(AccCache.accLock==0) {
|
||||||
|
LRUinsert(AccCache);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
||||||
|
{
|
||||||
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
|
||||||
|
assert(AccCache.cpuLock>0);
|
||||||
|
assert(AccCache.accLock==0);
|
||||||
|
|
||||||
|
AccCache.cpuLock--;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Action State StateNext Flush Clone
|
||||||
|
*
|
||||||
|
* CpuRead Empty CpuDirty - -
|
||||||
|
* CpuWrite Empty CpuDirty - -
|
||||||
|
* CpuRead CpuDirty CpuDirty - -
|
||||||
|
* CpuWrite CpuDirty CpuDirty - -
|
||||||
|
* CpuRead Consistent Consistent - -
|
||||||
|
* CpuWrite Consistent CpuDirty - -
|
||||||
|
* CpuRead AccDirty Consistent Y -
|
||||||
|
* CpuWrite AccDirty CpuDirty Y -
|
||||||
|
*/
|
||||||
|
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
|
||||||
|
{
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Find if present, otherwise get or force an empty
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
if ( EntryPresent(CpuPtr)==0 ){
|
||||||
|
EvictVictims(bytes);
|
||||||
|
EntryCreate(CpuPtr,bytes,mode,transient);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
|
||||||
|
assert((mode==CpuRead)||(mode==CpuWrite));
|
||||||
|
assert(AccCache.accLock==0); // Programming error
|
||||||
|
|
||||||
|
if(AccCache.state!=Empty) {
|
||||||
|
assert(AccCache.CpuPtr == CpuPtr);
|
||||||
|
assert(AccCache.bytes==bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(AccCache.state==Empty) {
|
||||||
|
AccCache.CpuPtr = CpuPtr;
|
||||||
|
AccCache.AccPtr = (uint64_t)NULL;
|
||||||
|
AccCache.bytes = bytes;
|
||||||
|
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
|
||||||
|
AccCache.accLock= 0;
|
||||||
|
AccCache.cpuLock= 1;
|
||||||
|
} else if(AccCache.state==CpuDirty ){
|
||||||
|
// AccPtr dont care, deferred allocate
|
||||||
|
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
|
||||||
|
AccCache.cpuLock++;
|
||||||
|
} else if(AccCache.state==Consistent) {
|
||||||
|
assert(AccCache.AccPtr != (uint64_t)NULL);
|
||||||
|
if(mode==CpuWrite)
|
||||||
|
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
|
||||||
|
else
|
||||||
|
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
|
||||||
|
AccCache.cpuLock++;
|
||||||
|
} else if(AccCache.state==AccDirty) {
|
||||||
|
assert(AccCache.AccPtr != (uint64_t)NULL);
|
||||||
|
Flush(AccCache);
|
||||||
|
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
|
||||||
|
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
|
||||||
|
AccCache.cpuLock++;
|
||||||
|
} else {
|
||||||
|
assert(0); // should be unreachable
|
||||||
|
}
|
||||||
|
|
||||||
|
AccCache.transient= transient? EvictNext : 0;
|
||||||
|
|
||||||
|
return AccCache.CpuPtr;
|
||||||
|
}
|
||||||
|
void MemoryManager::NotifyDeletion(void *_ptr)
|
||||||
|
{
|
||||||
|
// Look up in ViewCache
|
||||||
|
uint64_t ptr = (uint64_t)_ptr;
|
||||||
|
if(EntryPresent(ptr)) {
|
||||||
|
auto e = EntryLookup(ptr);
|
||||||
|
AccDiscard(e->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void MemoryManager::Print(void)
|
||||||
|
{
|
||||||
|
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||||
|
std::cout << GridLogDebug << "Memory Manager " << std::endl;
|
||||||
|
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||||
|
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
|
||||||
|
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
|
||||||
|
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
|
||||||
|
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
|
||||||
|
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
|
||||||
|
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
|
||||||
|
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
|
||||||
|
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
|
||||||
|
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||||
|
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
|
||||||
|
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||||
|
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
|
||||||
|
auto &AccCache = it->second;
|
||||||
|
|
||||||
|
std::string str;
|
||||||
|
if ( AccCache.state==Empty ) str = std::string("Empty");
|
||||||
|
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
|
||||||
|
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
|
||||||
|
if ( AccCache.state==Consistent)str = std::string("Consistent");
|
||||||
|
|
||||||
|
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
|
||||||
|
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
|
||||||
|
<< "\t" << AccCache.cpuLock
|
||||||
|
<< "\t" << AccCache.accLock
|
||||||
|
<< "\t" << AccCache.LRU_valid<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||||
|
|
||||||
|
};
|
||||||
|
int MemoryManager::isOpen (void* _CpuPtr)
|
||||||
|
{
|
||||||
|
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||||
|
if ( EntryPresent(CpuPtr) ){
|
||||||
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
return AccCache.cpuLock+AccCache.accLock;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
#endif
|
24
Grid/allocator/MemoryManagerShared.cc
Normal file
24
Grid/allocator/MemoryManagerShared.cc
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#include <Grid/GridCore.h>
|
||||||
|
#ifdef GRID_UVM
|
||||||
|
|
||||||
|
#warning "Grid is assuming unified virtual memory address space"
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// View management is 1:1 address space mapping
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
|
uint64_t MemoryManager::DeviceBytes;
|
||||||
|
uint64_t MemoryManager::DeviceLRUBytes;
|
||||||
|
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
|
||||||
|
uint64_t MemoryManager::HostToDeviceBytes;
|
||||||
|
uint64_t MemoryManager::DeviceToHostBytes;
|
||||||
|
uint64_t MemoryManager::HostToDeviceXfer;
|
||||||
|
uint64_t MemoryManager::DeviceToHostXfer;
|
||||||
|
|
||||||
|
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
|
||||||
|
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
|
||||||
|
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
|
||||||
|
void MemoryManager::Print(void){};
|
||||||
|
void MemoryManager::NotifyDeletion(void *ptr){};
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
#endif
|
67
Grid/allocator/MemoryStats.cc
Normal file
67
Grid/allocator/MemoryStats.cc
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
#include <Grid/GridCore.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
MemoryStats *MemoryProfiler::stats = nullptr;
|
||||||
|
bool MemoryProfiler::debug = false;
|
||||||
|
|
||||||
|
void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||||
|
{
|
||||||
|
#ifdef __linux__
|
||||||
|
int fd = open("/proc/self/pagemap", O_RDONLY);
|
||||||
|
assert(fd >= 0);
|
||||||
|
const int page_size = 4096;
|
||||||
|
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||||
|
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||||
|
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||||
|
uint64_t pagedata[npages];
|
||||||
|
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||||
|
assert(ret == offset);
|
||||||
|
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||||
|
assert(ret == sizeof(uint64_t) * npages);
|
||||||
|
int nhugepages = npages / 512;
|
||||||
|
int n4ktotal, nnothuge;
|
||||||
|
n4ktotal = 0;
|
||||||
|
nnothuge = 0;
|
||||||
|
for (int i = 0; i < nhugepages; ++i) {
|
||||||
|
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
||||||
|
for (int j = 0; j < 512; ++j) {
|
||||||
|
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
||||||
|
++n4ktotal;
|
||||||
|
if (pageaddr != baseaddr + j * page_size)
|
||||||
|
++nnothuge;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int rank = CartesianCommunicator::RankWorld();
|
||||||
|
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Render a byte count as a short human-readable string using binary
// (1024-based) steps and the suffixes "", K, M, G, T, P, E followed by "B".
//
//   bytes - the byte count to format
//
// Returns e.g. "0 B", "1 KB", "1.5 KB". Whole values are printed as integers,
// everything else with one decimal place.
std::string sizeString(const size_t bytes)
{
  constexpr unsigned int bufSize   = 256;
  constexpr size_t       nSuffixes = 7;
  const char *suffixes[nSuffixes] = {"", "K", "M", "G", "T", "P", "E"};
  char   buf[bufSize];
  size_t s     = 0;
  double count = bytes;

  // Stop at the last suffix: the previous bound (s < 7) allowed s to reach 7,
  // one past the end of the suffix table.
  while (count >= 1024 && s + 1 < nSuffixes)
  {
    s++;
    count /= 1024;
  }

  if (count - floor(count) == 0.0)
  {
    snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
    snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
  }

  return std::string(buf);
}
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
95
Grid/allocator/MemoryStats.h
Normal file
95
Grid/allocator/MemoryStats.h
Normal file
@ -0,0 +1,95 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
    Source file: ./Grid/allocator/MemoryStats.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
std::string sizeString(size_t bytes);
|
||||||
|
|
||||||
|
// Byte counters updated by the profilerAllocate / profilerFree macros.
// All counters start at zero.
struct MemoryStats
{
  size_t totalAllocated{0};     // running sum of every allocation size
  size_t maxAllocated{0};       // largest value currentlyAllocated has reached
  size_t currentlyAllocated{0}; // allocated minus freed
  size_t totalFreed{0};         // running sum of every freed size
};

// Global switchboard for the memory profiling macros.
class MemoryProfiler
{
public:
  // Counters to update; the macros skip accounting while this is null.
  static MemoryStats *stats;
  // When true the macros also log each allocate/free event.
  static bool         debug;
};
|
||||||
|
|
||||||
|
// Expands to a std::string of the form "<exact count> (<sizeString form>)".
// The expansion is parenthesised so it stays a single operand inside any
// surrounding expression. Note that `bytes` is evaluated twice.
#define memString(bytes) (std::to_string(bytes) + " (" + sizeString(bytes) + ")")
|
||||||
|
// Logs the four MemoryStats counters (and the stats pointer itself) to
// GridLogDebug; does nothing while MemoryProfiler::stats is null.
#define profilerDebugPrint						\
  if (MemoryProfiler::stats)						\
    {									\
      auto profStats_ = MemoryProfiler::stats;				\
      std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
      std::cout << GridLogDebug << "[Memory debug] total  : " << memString(profStats_->totalAllocated) \
		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] max    : " << memString(profStats_->maxAllocated) \
		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] current: " << memString(profStats_->currentlyAllocated) \
		<< std::endl;						\
      std::cout << GridLogDebug << "[Memory debug] freed  : " << memString(profStats_->totalFreed) \
		<< std::endl;						\
    }
|
||||||
|
|
||||||
|
// Records an allocation of `bytes` in MemoryProfiler::stats (when non-null):
// adds to totalAllocated and currentlyAllocated and raises maxAllocated if
// needed. When MemoryProfiler::debug is set it also logs the event and dumps
// the counters.
// The macro-local pointer has a reserved-looking name so that an argument
// expression mentioning a caller variable called `s` is no longer captured.
// `bytes` is evaluated more than once.
#define profilerAllocate(bytes)						\
  if (MemoryProfiler::stats)						\
    {									\
      auto profStats_ = MemoryProfiler::stats;				\
      profStats_->totalAllocated     += (bytes);			\
      profStats_->currentlyAllocated += (bytes);			\
      profStats_->maxAllocated = std::max(profStats_->maxAllocated, profStats_->currentlyAllocated); \
    }									\
  if (MemoryProfiler::debug)						\
    {									\
      std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
      profilerDebugPrint;						\
    }
|
||||||
|
|
||||||
|
// Records a release of `bytes` in MemoryProfiler::stats (when non-null):
// adds to totalFreed and subtracts from currentlyAllocated. When
// MemoryProfiler::debug is set it also logs the event and dumps the counters.
// The macro-local pointer has a reserved-looking name so that an argument
// expression mentioning a caller variable called `s` is no longer captured.
// `bytes` is evaluated more than once.
#define profilerFree(bytes)						\
  if (MemoryProfiler::stats)						\
    {									\
      auto profStats_ = MemoryProfiler::stats;				\
      profStats_->totalFreed         += (bytes);			\
      profStats_->currentlyAllocated -= (bytes);			\
    }									\
  if (MemoryProfiler::debug)						\
    {									\
      std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
      profilerDebugPrint;						\
    }
|
||||||
|
|
||||||
|
void check_huge_pages(void *Buf,uint64_t BYTES);
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
@ -81,6 +81,7 @@ public:
|
|||||||
|
|
||||||
bool _isCheckerBoarded;
|
bool _isCheckerBoarded;
|
||||||
int LocallyPeriodic;
|
int LocallyPeriodic;
|
||||||
|
Coordinate _checker_dim_mask;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -38,6 +38,7 @@ class GridCartesian: public GridBase {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
int dummy;
|
int dummy;
|
||||||
|
Coordinate _checker_dim_mask;
|
||||||
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -104,6 +105,7 @@ public:
|
|||||||
_ldimensions.resize(_ndimension);
|
_ldimensions.resize(_ndimension);
|
||||||
_rdimensions.resize(_ndimension);
|
_rdimensions.resize(_ndimension);
|
||||||
_simd_layout.resize(_ndimension);
|
_simd_layout.resize(_ndimension);
|
||||||
|
_checker_dim_mask.resize(_ndimension);;
|
||||||
_lstart.resize(_ndimension);
|
_lstart.resize(_ndimension);
|
||||||
_lend.resize(_ndimension);
|
_lend.resize(_ndimension);
|
||||||
|
|
||||||
@ -114,6 +116,8 @@ public:
|
|||||||
|
|
||||||
for (int d = 0; d < _ndimension; d++)
|
for (int d = 0; d < _ndimension; d++)
|
||||||
{
|
{
|
||||||
|
_checker_dim_mask[d]=0;
|
||||||
|
|
||||||
_fdimensions[d] = dimensions[d]; // Global dimensions
|
_fdimensions[d] = dimensions[d]; // Global dimensions
|
||||||
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
_gdimensions[d] = _fdimensions[d]; // Global dimensions
|
||||||
_simd_layout[d] = simd_layout[d];
|
_simd_layout[d] = simd_layout[d];
|
||||||
|
@ -36,11 +36,27 @@ static const int CbBlack=1;
|
|||||||
static const int Even =CbRed;
|
static const int Even =CbRed;
|
||||||
static const int Odd =CbBlack;
|
static const int Odd =CbBlack;
|
||||||
|
|
||||||
|
// Parity (0 or 1) of the site with outer index `oindex`.
//   oindex      - lexicographic index, decoded against rdim by Lexicographic::CoorFromIndex
//   rdim        - dimensions used for the decode; its size sets the number of dimensions
//   chk_dim_msk - per-dimension flag; only dimensions with a non-zero flag contribute
// Returns the low bit of the sum of the contributing coordinates.
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
{
  int ndim=rdim.size();
  Coordinate site(ndim);

  Lexicographic::CoorFromIndex(site,oindex,rdim);

  int coor_sum=0;
  for(int mu=0;mu<ndim;mu++){
    if(chk_dim_msk[mu]) {
      coor_sum+=site[mu];
    }
  }
  return (coor_sum&0x1);
}
|
||||||
|
|
||||||
|
|
||||||
// Specialise this for red black grids storing half the data like a chess board.
|
// Specialise this for red black grids storing half the data like a chess board.
|
||||||
class GridRedBlackCartesian : public GridBase
|
class GridRedBlackCartesian : public GridBase
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Coordinate _checker_dim_mask;
|
// Coordinate _checker_dim_mask;
|
||||||
int _checker_dim;
|
int _checker_dim;
|
||||||
std::vector<int> _checker_board;
|
std::vector<int> _checker_board;
|
||||||
|
|
||||||
|
@ -114,6 +114,7 @@ public:
|
|||||||
void GlobalSumVector(RealD *,int N);
|
void GlobalSumVector(RealD *,int N);
|
||||||
void GlobalSum(uint32_t &);
|
void GlobalSum(uint32_t &);
|
||||||
void GlobalSum(uint64_t &);
|
void GlobalSum(uint64_t &);
|
||||||
|
void GlobalSumVector(uint64_t*,int N);
|
||||||
void GlobalSum(ComplexF &c);
|
void GlobalSum(ComplexF &c);
|
||||||
void GlobalSumVector(ComplexF *c,int N);
|
void GlobalSumVector(ComplexF *c,int N);
|
||||||
void GlobalSum(ComplexD &c);
|
void GlobalSum(ComplexD &c);
|
||||||
|
@ -275,6 +275,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
|
|||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
void CartesianCommunicator::GlobalXOR(uint32_t &u){
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
|
@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){}
|
|||||||
void CartesianCommunicator::GlobalSum(float &){}
|
void CartesianCommunicator::GlobalSum(float &){}
|
||||||
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
||||||
void CartesianCommunicator::GlobalSum(double &){}
|
void CartesianCommunicator::GlobalSum(double &){}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(double *,int N){}
|
||||||
void CartesianCommunicator::GlobalSum(uint32_t &){}
|
void CartesianCommunicator::GlobalSum(uint32_t &){}
|
||||||
void CartesianCommunicator::GlobalSum(uint64_t &){}
|
void CartesianCommunicator::GlobalSum(uint64_t &){}
|
||||||
void CartesianCommunicator::GlobalSumVector(double *,int N){}
|
void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
|
||||||
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
||||||
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
||||||
|
|
||||||
|
@ -74,7 +74,9 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
|
|||||||
if (heap_bytes >= heap_size) {
|
if (heap_bytes >= heap_size) {
|
||||||
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
|
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
|
||||||
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
|
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
|
||||||
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
|
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
|
||||||
|
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
|
||||||
|
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
|
||||||
assert(heap_bytes<heap_size);
|
assert(heap_bytes<heap_size);
|
||||||
}
|
}
|
||||||
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
|
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
|
||||||
|
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
#include <pwd.h>
|
#include <pwd.h>
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
#ifdef GRID_CUDA
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -170,17 +170,24 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
|
|||||||
std::vector<int> primes({2,3,5});
|
std::vector<int> primes({2,3,5});
|
||||||
|
|
||||||
int dim = 0;
|
int dim = 0;
|
||||||
|
int last_dim = ndimension - 1;
|
||||||
int AutoShmSize = 1;
|
int AutoShmSize = 1;
|
||||||
while(AutoShmSize != WorldShmSize) {
|
while(AutoShmSize != WorldShmSize) {
|
||||||
for(int p=0;p<primes.size();p++) {
|
int p;
|
||||||
|
for(p=0;p<primes.size();p++) {
|
||||||
int prime=primes[p];
|
int prime=primes[p];
|
||||||
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
||||||
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
||||||
AutoShmSize*=prime;
|
AutoShmSize*=prime;
|
||||||
ShmDims[dim]*=prime;
|
ShmDims[dim]*=prime;
|
||||||
|
last_dim = dim;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (p == primes.size() && last_dim == dim) {
|
||||||
|
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
dim=(dim+1) %ndimension;
|
dim=(dim+1) %ndimension;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -413,7 +420,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended
|
// Hugetlbfs mapping intended
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef GRID_NVCC
|
#ifdef GRID_CUDA
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
void * ShmCommBuf ;
|
void * ShmCommBuf ;
|
||||||
@ -433,13 +440,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
|
||||||
|
|
||||||
#ifdef GRID_IBM_SUMMIT
|
|
||||||
// IBM Jsrun makes cuda Device numbering screwy and not match rank
|
|
||||||
std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
|
|
||||||
#else
|
|
||||||
std::cout << "setting device to WorldShmRank"<<std::endl;
|
|
||||||
cudaSetDevice(WorldShmRank);
|
|
||||||
#endif
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -677,7 +677,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
/////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////
|
||||||
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||||
{
|
{
|
||||||
#ifdef GRID_NVCC
|
#ifdef GRID_CUDA
|
||||||
cudaMemset(dest,0,bytes);
|
cudaMemset(dest,0,bytes);
|
||||||
#else
|
#else
|
||||||
bzero(dest,bytes);
|
bzero(dest,bytes);
|
||||||
@ -685,7 +685,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
|||||||
}
|
}
|
||||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
|
||||||
{
|
{
|
||||||
#ifdef GRID_NVCC
|
#ifdef GRID_CUDA
|
||||||
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
||||||
#else
|
#else
|
||||||
bcopy(src,dest,bytes);
|
bcopy(src,dest,bytes);
|
||||||
|
@ -49,4 +49,29 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_COMMS_SHMEM
|
#ifdef GRID_COMMS_SHMEM
|
||||||
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
|
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
template<typename Op, typename T1>
|
||||||
|
auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
|
||||||
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
|
||||||
|
{
|
||||||
|
return Cshift(closure(expr),dim,shift);
|
||||||
|
}
|
||||||
|
template <class Op, class T1, class T2>
|
||||||
|
auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
|
||||||
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
|
||||||
|
{
|
||||||
|
return Cshift(closure(expr),dim,shift);
|
||||||
|
}
|
||||||
|
template <class Op, class T1, class T2, class T3>
|
||||||
|
auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
|
||||||
|
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
||||||
|
eval(0, expr.arg2),
|
||||||
|
eval(0, expr.arg3)))>
|
||||||
|
{
|
||||||
|
return Cshift(closure(expr),dim,shift);
|
||||||
|
}
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -29,6 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
extern Vector<std::pair<int,int> > Cshift_table;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
@ -46,16 +48,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int ent = 0;
|
int ent = 0;
|
||||||
|
|
||||||
static Vector<std::pair<int,int> > table; table.resize(e1*e2);
|
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
||||||
|
|
||||||
int stride=rhs.Grid()->_slice_stride[dimension];
|
int stride=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
auto rhs_v = rhs.View();
|
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*stride;
|
int o = n*stride;
|
||||||
int bo = n*e2;
|
int bo = n*e2;
|
||||||
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
|
Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -65,15 +67,20 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
|||||||
int o = n*stride;
|
int o = n*stride;
|
||||||
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
|
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
|
Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
thread_for(i,ent,{
|
{
|
||||||
buffer[table[i].first]=rhs_v[table[i].second];
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
|
auto buffer_p = & buffer[0];
|
||||||
|
auto table = &Cshift_table[0];
|
||||||
|
accelerator_for(i,ent,1,{
|
||||||
|
buffer_p[table[i].first]=rhs_v[table[i].second];
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// Gather for when there *is* need to SIMD split
|
// Gather for when there *is* need to SIMD split
|
||||||
@ -95,35 +102,37 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
auto rhs_v = rhs.View();
|
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
thread_for_collapse(2,n,e1,{
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
for(int b=0;b<e2;b++){
|
accelerator_for2d(n,e1,b,e2,1,{
|
||||||
|
|
||||||
int o = n*n1;
|
int o = n*n1;
|
||||||
int offset = b+n*e2;
|
int offset = b+n*e2;
|
||||||
|
|
||||||
vobj temp =rhs_v[so+o+b];
|
vobj temp =rhs_v[so+o+b];
|
||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
}
|
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
|
|
||||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||||
// Test_cshift_red_black code.
|
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||||
std::cout << " Dense packed buffer WARNING " <<std::endl;
|
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||||
thread_for_collapse(2,n,e1,{
|
accelerator_for2d(n,e1,b,e2,1,{
|
||||||
for(int b=0;b<e2;b++){
|
|
||||||
|
Coordinate coor;
|
||||||
|
|
||||||
int o=n*n1;
|
int o=n*n1;
|
||||||
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
|
int oindex = o+b;
|
||||||
|
|
||||||
|
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
||||||
|
|
||||||
|
int ocb=1<<cb;
|
||||||
int offset = b+n*e2;
|
int offset = b+n*e2;
|
||||||
|
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
vobj temp =rhs_v[so+o+b];
|
vobj temp =rhs_v[so+o+b];
|
||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -145,7 +154,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int stride=rhs.Grid()->_slice_stride[dimension];
|
int stride=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
||||||
|
|
||||||
int ent =0;
|
int ent =0;
|
||||||
|
|
||||||
if ( cbmask ==0x3 ) {
|
if ( cbmask ==0x3 ) {
|
||||||
@ -154,7 +164,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*rhs.Grid()->_slice_stride[dimension];
|
int o =n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int bo =n*rhs.Grid()->_slice_block[dimension];
|
int bo =n*rhs.Grid()->_slice_block[dimension];
|
||||||
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
|
Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,17 +175,21 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
int o =n*rhs.Grid()->_slice_stride[dimension];
|
int o =n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
table[ent++]=std::pair<int,int> (so+o+b,bo++);
|
Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto rhs_v = rhs.View();
|
{
|
||||||
thread_for(i,ent,{
|
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||||
rhs_v[table[i].first]=buffer[table[i].second];
|
auto buffer_p = & buffer[0];
|
||||||
|
auto table = &Cshift_table[0];
|
||||||
|
accelerator_for(i,ent,1,{
|
||||||
|
rhs_v[table[i].first]=buffer_p[table[i].second];
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Scatter for when there *is* need to SIMD split
|
// Scatter for when there *is* need to SIMD split
|
||||||
@ -194,13 +208,11 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
|
|
||||||
if(cbmask ==0x3 ) {
|
if(cbmask ==0x3 ) {
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||||
thread_for_collapse(2,n,e1,{
|
accelerator_for2d(n,e1,b,e2,1,{
|
||||||
for(int b=0;b<e2;b++){
|
|
||||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
||||||
merge(rhs_v[so+o+b],pointers,offset);
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
}
|
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
@ -208,7 +220,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
// Test_cshift_red_black code.
|
// Test_cshift_red_black code.
|
||||||
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
||||||
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v, rhs, CpuWrite);
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||||
@ -225,6 +237,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// local to node block strided copies
|
// local to node block strided copies
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
@ -239,14 +252,16 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int stride = rhs.Grid()->_slice_stride[dimension];
|
int stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
|
||||||
|
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
||||||
|
|
||||||
int ent=0;
|
int ent=0;
|
||||||
|
|
||||||
if(cbmask == 0x3 ){
|
if(cbmask == 0x3 ){
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride+b;
|
int o =n*stride+b;
|
||||||
table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -255,23 +270,24 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
int o =n*stride+b;
|
int o =n*stride+b;
|
||||||
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
|
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto rhs_v = rhs.View();
|
{
|
||||||
auto lhs_v = lhs.View();
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
thread_for(i,ent,{
|
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||||
|
auto table = &Cshift_table[0];
|
||||||
|
accelerator_for(i,ent,1,{
|
||||||
lhs_v[table[i].first]=rhs_v[table[i].second];
|
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||||
});
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
||||||
{
|
{
|
||||||
|
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
|
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
|
||||||
@ -285,30 +301,34 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
int e2=rhs.Grid()->_slice_block [dimension];
|
int e2=rhs.Grid()->_slice_block [dimension];
|
||||||
int stride = rhs.Grid()->_slice_stride[dimension];
|
int stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
|
if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
|
||||||
|
|
||||||
int ent=0;
|
int ent=0;
|
||||||
|
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride;
|
int o =n*stride;
|
||||||
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||||
}}
|
}}
|
||||||
} else {
|
} else {
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o =n*stride;
|
int o =n*stride;
|
||||||
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
|
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
|
||||||
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto rhs_v = rhs.View();
|
{
|
||||||
auto lhs_v = lhs.View();
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
thread_for(i,ent,{
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
|
auto table = &Cshift_table[0];
|
||||||
|
accelerator_for(i,ent,1,{
|
||||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Local to node Cshift
|
// Local to node Cshift
|
||||||
|
4
Grid/cshift/Cshift_table.cc
Normal file
4
Grid/cshift/Cshift_table.cc
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#include <Grid/GridCore.h>
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
Vector<std::pair<int,int> > Cshift_table;
|
||||||
|
NAMESPACE_END(Grid);
|
@ -26,6 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#pragma once
|
#pragma once
|
||||||
|
#include <Grid/lattice/Lattice_view.h>
|
||||||
#include <Grid/lattice/Lattice_base.h>
|
#include <Grid/lattice/Lattice_base.h>
|
||||||
#include <Grid/lattice/Lattice_conformable.h>
|
#include <Grid/lattice/Lattice_conformable.h>
|
||||||
#include <Grid/lattice/Lattice_ET.h>
|
#include <Grid/lattice/Lattice_ET.h>
|
||||||
@ -35,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/lattice/Lattice_local.h>
|
#include <Grid/lattice/Lattice_local.h>
|
||||||
#include <Grid/lattice/Lattice_reduction.h>
|
#include <Grid/lattice/Lattice_reduction.h>
|
||||||
#include <Grid/lattice/Lattice_peekpoke.h>
|
#include <Grid/lattice/Lattice_peekpoke.h>
|
||||||
#include <Grid/lattice/Lattice_reality.h>
|
//#include <Grid/lattice/Lattice_reality.h>
|
||||||
#include <Grid/lattice/Lattice_comparison_utils.h>
|
#include <Grid/lattice/Lattice_comparison_utils.h>
|
||||||
#include <Grid/lattice/Lattice_comparison.h>
|
#include <Grid/lattice/Lattice_comparison.h>
|
||||||
#include <Grid/lattice/Lattice_coordinate.h>
|
#include <Grid/lattice/Lattice_coordinate.h>
|
||||||
@ -43,4 +44,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/lattice/Lattice_rng.h>
|
#include <Grid/lattice/Lattice_rng.h>
|
||||||
#include <Grid/lattice/Lattice_unary.h>
|
#include <Grid/lattice/Lattice_unary.h>
|
||||||
#include <Grid/lattice/Lattice_transfer.h>
|
#include <Grid/lattice/Lattice_transfer.h>
|
||||||
|
#include <Grid/lattice/Lattice_basis.h>
|
||||||
|
@ -9,6 +9,7 @@ Copyright (C) 2015
|
|||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: neo <cossu@post.kek.jp>
|
Author: neo <cossu@post.kek.jp>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -91,12 +92,18 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
|
|||||||
{
|
{
|
||||||
return arg[ss];
|
return arg[ss];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// What needs this?
|
||||||
|
// Cannot be legal on accelerator
|
||||||
|
// Comparison must convert
|
||||||
|
#if 1
|
||||||
template <class lobj> accelerator_inline
|
template <class lobj> accelerator_inline
|
||||||
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
|
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
|
||||||
{
|
{
|
||||||
auto view = arg.View();
|
auto view = arg.View(AcceleratorRead);
|
||||||
return view[ss];
|
return view[ss];
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
// handle nodes in syntax tree- eval one operand
|
// handle nodes in syntax tree- eval one operand
|
||||||
@ -179,16 +186,12 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
|
|||||||
cb = lat.Checkerboard();
|
cb = lat.Checkerboard();
|
||||||
}
|
}
|
||||||
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
||||||
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
|
inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename Op, typename T1> inline
|
template <typename Op, typename T1> inline
|
||||||
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
|
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
|
||||||
{
|
{
|
||||||
CBFromExpression(cb, expr.arg1); // recurse AST
|
CBFromExpression(cb, expr.arg1); // recurse AST
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Op, typename T1, typename T2> inline
|
template <typename Op, typename T1, typename T2> inline
|
||||||
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
|
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
{
|
{
|
||||||
@ -203,6 +206,68 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
|
|||||||
CBFromExpression(cb, expr.arg3); // recurse AST
|
CBFromExpression(cb, expr.arg3); // recurse AST
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// ViewOpen
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
|
||||||
|
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
|
||||||
|
{
|
||||||
|
lat.ViewOpen(AcceleratorRead);
|
||||||
|
}
|
||||||
|
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
||||||
|
inline void ExpressionViewOpen(T1 &notlat) {}
|
||||||
|
|
||||||
|
template <typename Op, typename T1> inline
|
||||||
|
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
|
||||||
|
{
|
||||||
|
ExpressionViewOpen(expr.arg1); // recurse AST
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Op, typename T1, typename T2> inline
|
||||||
|
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
|
{
|
||||||
|
ExpressionViewOpen(expr.arg1); // recurse AST
|
||||||
|
ExpressionViewOpen(expr.arg2); // recurse AST
|
||||||
|
}
|
||||||
|
template <typename Op, typename T1, typename T2, typename T3>
|
||||||
|
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
|
{
|
||||||
|
ExpressionViewOpen(expr.arg1); // recurse AST
|
||||||
|
ExpressionViewOpen(expr.arg2); // recurse AST
|
||||||
|
ExpressionViewOpen(expr.arg3); // recurse AST
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// ViewClose
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
|
||||||
|
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
|
||||||
|
{
|
||||||
|
lat.ViewClose();
|
||||||
|
}
|
||||||
|
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
|
||||||
|
inline void ExpressionViewClose(T1 &notlat) {}
|
||||||
|
|
||||||
|
template <typename Op, typename T1> inline
|
||||||
|
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
|
||||||
|
{
|
||||||
|
ExpressionViewClose(expr.arg1); // recurse AST
|
||||||
|
}
|
||||||
|
template <typename Op, typename T1, typename T2> inline
|
||||||
|
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
|
{
|
||||||
|
ExpressionViewClose(expr.arg1); // recurse AST
|
||||||
|
ExpressionViewClose(expr.arg2); // recurse AST
|
||||||
|
}
|
||||||
|
template <typename Op, typename T1, typename T2, typename T3>
|
||||||
|
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
|
{
|
||||||
|
ExpressionViewClose(expr.arg1); // recurse AST
|
||||||
|
ExpressionViewClose(expr.arg2); // recurse AST
|
||||||
|
ExpressionViewClose(expr.arg3); // recurse AST
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// Unary operators and funcs
|
// Unary operators and funcs
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
@ -55,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -72,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -88,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -107,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
mult(&tmp,&lhs_v(ss),&rhs);
|
mult(&tmp,&lhs_v(ss),&rhs);
|
||||||
@ -120,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -134,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -147,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -164,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto rhs_v = lhs.View();
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -178,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto rhs_v = lhs.View();
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -192,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto rhs_v = lhs.View();
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -205,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto rhs_v = lhs.View();
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -220,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
|
|||||||
ret.Checkerboard() = x.Checkerboard();
|
ret.Checkerboard() = x.Checkerboard();
|
||||||
conformable(ret,x);
|
conformable(ret,x);
|
||||||
conformable(x,y);
|
conformable(x,y);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto x_v = x.View();
|
autoView( x_v , x, AcceleratorRead);
|
||||||
auto y_v = y.View();
|
autoView( y_v , y, AcceleratorRead);
|
||||||
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||||
auto tmp = a*x_v(ss)+y_v(ss);
|
auto tmp = a*x_v(ss)+y_v(ss);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
@ -233,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
|||||||
ret.Checkerboard() = x.Checkerboard();
|
ret.Checkerboard() = x.Checkerboard();
|
||||||
conformable(ret,x);
|
conformable(ret,x);
|
||||||
conformable(x,y);
|
conformable(x,y);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto x_v = x.View();
|
autoView( x_v , x, AcceleratorRead);
|
||||||
auto y_v = y.View();
|
autoView( y_v , y, AcceleratorRead);
|
||||||
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
|
@ -9,6 +9,7 @@ Copyright (C) 2015
|
|||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -28,6 +29,7 @@ See the full license in the file "LICENSE" in the top level distribution
|
|||||||
directory
|
directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#define STREAMING_STORES
|
#define STREAMING_STORES
|
||||||
@ -36,129 +38,6 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
extern int GridCshiftPermuteMap[4][16];
|
extern int GridCshiftPermuteMap[4][16];
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
|
||||||
// Base class which can be used by traits to pick up behaviour
|
|
||||||
///////////////////////////////////////////////////////////////////
|
|
||||||
class LatticeBase {};
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Conformable checks; same instance of Grid required
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
|
|
||||||
{
|
|
||||||
assert(lhs == rhs);
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Minimal base class containing only data valid to access from accelerator
|
|
||||||
// _odata will be a managed pointer in CUDA
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Force access to lattice through a view object.
|
|
||||||
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
|
|
||||||
// strict since host code could in principle access directly through the lattice object
|
|
||||||
// Need to decide programming model.
|
|
||||||
#define LATTICE_VIEW_STRICT
|
|
||||||
template<class vobj> class LatticeAccelerator : public LatticeBase
|
|
||||||
{
|
|
||||||
protected:
|
|
||||||
GridBase *_grid;
|
|
||||||
int checkerboard;
|
|
||||||
vobj *_odata; // A managed pointer
|
|
||||||
uint64_t _odata_size;
|
|
||||||
public:
|
|
||||||
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
|
|
||||||
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
|
|
||||||
accelerator_inline int Checkerboard(void) const { return checkerboard; };
|
|
||||||
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
|
|
||||||
accelerator_inline void Conformable(GridBase * &grid) const
|
|
||||||
{
|
|
||||||
if (grid) conformable(grid, _grid);
|
|
||||||
else grid = _grid;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// A View class which provides accessor to the data.
|
|
||||||
// This will be safe to call from accelerator_for and is trivially copy constructible
|
|
||||||
// The copy constructor for this will need to be used by device lambda functions
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<class vobj>
|
|
||||||
class LatticeView : public LatticeAccelerator<vobj>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
|
|
||||||
|
|
||||||
// Rvalue
|
|
||||||
#ifdef __CUDA_ARCH__
|
|
||||||
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
|
|
||||||
#else
|
|
||||||
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
|
|
||||||
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
|
|
||||||
|
|
||||||
accelerator_inline uint64_t begin(void) const { return 0;};
|
|
||||||
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
|
||||||
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
|
|
||||||
|
|
||||||
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Lattice expression types used by ET to assemble the AST
|
|
||||||
//
|
|
||||||
// Need to be able to detect code paths according to the whether a lattice object or not
|
|
||||||
// so introduce some trait type things
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
class LatticeExpressionBase {};
|
|
||||||
|
|
||||||
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
|
|
||||||
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
|
|
||||||
|
|
||||||
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
|
|
||||||
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
|
|
||||||
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
|
|
||||||
|
|
||||||
template <typename Op, typename _T1>
|
|
||||||
class LatticeUnaryExpression : public LatticeExpressionBase
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
typedef typename ViewMap<_T1>::Type T1;
|
|
||||||
Op op;
|
|
||||||
T1 arg1;
|
|
||||||
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename Op, typename _T1, typename _T2>
|
|
||||||
class LatticeBinaryExpression : public LatticeExpressionBase
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
typedef typename ViewMap<_T1>::Type T1;
|
|
||||||
typedef typename ViewMap<_T2>::Type T2;
|
|
||||||
Op op;
|
|
||||||
T1 arg1;
|
|
||||||
T2 arg2;
|
|
||||||
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
|
|
||||||
};
|
|
||||||
|
|
||||||
template <typename Op, typename _T1, typename _T2, typename _T3>
|
|
||||||
class LatticeTrinaryExpression : public LatticeExpressionBase
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
typedef typename ViewMap<_T1>::Type T1;
|
|
||||||
typedef typename ViewMap<_T2>::Type T2;
|
|
||||||
typedef typename ViewMap<_T3>::Type T3;
|
|
||||||
Op op;
|
|
||||||
T1 arg1;
|
|
||||||
T2 arg2;
|
|
||||||
T3 arg3;
|
|
||||||
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
|
|
||||||
};
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// The real lattice class, with normal copy and assignment semantics.
|
// The real lattice class, with normal copy and assignment semantics.
|
||||||
// This contains extra (host resident) grid pointer data that may be accessed by host code
|
// This contains extra (host resident) grid pointer data that may be accessed by host code
|
||||||
@ -201,14 +80,23 @@ private:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Can use to make accelerator dirty without copy from host ; useful for temporaries "dont care" prev contents
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void SetViewMode(ViewMode mode) {
|
||||||
|
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
||||||
|
accessor.ViewClose();
|
||||||
|
}
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
// Return a view object that may be dereferenced in site loops.
|
// Return a view object that may be dereferenced in site loops.
|
||||||
// The view is trivially copy constructible and may be copied to an accelerator device
|
// The view is trivially copy constructible and may be copied to an accelerator device
|
||||||
// in device lambdas
|
// in device lambdas
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
LatticeView<vobj> View (void) const
|
|
||||||
|
LatticeView<vobj> View (ViewMode mode) const
|
||||||
{
|
{
|
||||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
|
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
||||||
return accessor;
|
return accessor;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -232,11 +120,15 @@ public:
|
|||||||
assert( (cb==Odd) || (cb==Even));
|
assert( (cb==Odd) || (cb==Even));
|
||||||
this->checkerboard=cb;
|
this->checkerboard=cb;
|
||||||
|
|
||||||
auto me = View();
|
auto exprCopy = expr;
|
||||||
|
ExpressionViewOpen(exprCopy);
|
||||||
|
auto me = View(AcceleratorWriteDiscard);
|
||||||
accelerator_for(ss,me.size(),1,{
|
accelerator_for(ss,me.size(),1,{
|
||||||
auto tmp = eval(ss,expr);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
vstream(me[ss],tmp);
|
||||||
});
|
});
|
||||||
|
me.ViewClose();
|
||||||
|
ExpressionViewClose(exprCopy);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
|
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
|
||||||
@ -251,11 +143,15 @@ public:
|
|||||||
assert( (cb==Odd) || (cb==Even));
|
assert( (cb==Odd) || (cb==Even));
|
||||||
this->checkerboard=cb;
|
this->checkerboard=cb;
|
||||||
|
|
||||||
auto me = View();
|
auto exprCopy = expr;
|
||||||
|
ExpressionViewOpen(exprCopy);
|
||||||
|
auto me = View(AcceleratorWriteDiscard);
|
||||||
accelerator_for(ss,me.size(),1,{
|
accelerator_for(ss,me.size(),1,{
|
||||||
auto tmp = eval(ss,expr);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
vstream(me[ss],tmp);
|
||||||
});
|
});
|
||||||
|
me.ViewClose();
|
||||||
|
ExpressionViewClose(exprCopy);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
|
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
|
||||||
@ -269,11 +165,15 @@ public:
|
|||||||
CBFromExpression(cb,expr);
|
CBFromExpression(cb,expr);
|
||||||
assert( (cb==Odd) || (cb==Even));
|
assert( (cb==Odd) || (cb==Even));
|
||||||
this->checkerboard=cb;
|
this->checkerboard=cb;
|
||||||
auto me = View();
|
auto exprCopy = expr;
|
||||||
|
ExpressionViewOpen(exprCopy);
|
||||||
|
auto me = View(AcceleratorWriteDiscard);
|
||||||
accelerator_for(ss,me.size(),1,{
|
accelerator_for(ss,me.size(),1,{
|
||||||
auto tmp = eval(ss,expr);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
vstream(me[ss],tmp);
|
||||||
});
|
});
|
||||||
|
me.ViewClose();
|
||||||
|
ExpressionViewClose(exprCopy);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
//GridFromExpression is tricky to do
|
//GridFromExpression is tricky to do
|
||||||
@ -324,10 +224,11 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
||||||
auto me = View();
|
auto me = View(CpuWrite);
|
||||||
thread_for(ss,me.size(),{
|
thread_for(ss,me.size(),{
|
||||||
me[ss]= r;
|
me[ss]= r;
|
||||||
});
|
});
|
||||||
|
me.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -337,11 +238,12 @@ public:
|
|||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// user defined constructor
|
// user defined constructor
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
Lattice(GridBase *grid) {
|
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
|
||||||
this->_grid = grid;
|
this->_grid = grid;
|
||||||
resize(this->_grid->oSites());
|
resize(this->_grid->oSites());
|
||||||
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
|
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
|
||||||
this->checkerboard=0;
|
this->checkerboard=0;
|
||||||
|
SetViewMode(mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
// virtual ~Lattice(void) = default;
|
// virtual ~Lattice(void) = default;
|
||||||
@ -357,7 +259,6 @@ public:
|
|||||||
// copy constructor
|
// copy constructor
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
Lattice(const Lattice& r){
|
Lattice(const Lattice& r){
|
||||||
// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
|
|
||||||
this->_grid = r.Grid();
|
this->_grid = r.Grid();
|
||||||
resize(this->_grid->oSites());
|
resize(this->_grid->oSites());
|
||||||
*this = r;
|
*this = r;
|
||||||
@ -380,11 +281,12 @@ public:
|
|||||||
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
|
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
|
||||||
conformable(*this,r);
|
conformable(*this,r);
|
||||||
this->checkerboard = r.Checkerboard();
|
this->checkerboard = r.Checkerboard();
|
||||||
auto me = View();
|
auto me = View(AcceleratorWriteDiscard);
|
||||||
auto him= r.View();
|
auto him= r.View(AcceleratorRead);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(me[ss],him(ss));
|
coalescedWrite(me[ss],him(ss));
|
||||||
});
|
});
|
||||||
|
me.ViewClose(); him.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -394,11 +296,12 @@ public:
|
|||||||
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
|
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
|
||||||
this->checkerboard = r.Checkerboard();
|
this->checkerboard = r.Checkerboard();
|
||||||
conformable(*this,r);
|
conformable(*this,r);
|
||||||
auto me = View();
|
auto me = View(AcceleratorWriteDiscard);
|
||||||
auto him= r.View();
|
auto him= r.View(AcceleratorRead);
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(me[ss],him(ss));
|
coalescedWrite(me[ss],him(ss));
|
||||||
});
|
});
|
||||||
|
me.ViewClose(); him.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
|
226
Grid/lattice/Lattice_basis.h
Normal file
226
Grid/lattice/Lattice_basis.h
Normal file
@ -0,0 +1,226 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/lattice/Lattice_basis.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
  // Sequentially project the first k basis vectors out of w:
  //
  //     w <- w - innerProduct(basis[j],w) * basis[j]      for j = 0 .. k-1
  //
  // Each coefficient is evaluated against the already updated w.
  //
  // Optimisation notes kept from the original author (estimates, not measured here):
  //  - if the basis[j] are assumed already orthonormal, all inner products
  //    can be taken in parallel, saving 2x bandwidth;
  //  - the update line could then save 3x bandwidth;
  //  - perhaps 2.5x speed up, 2x overall in Multigrid Lanczos.
  int j = 0;
  while ( j < k ) {
    Field &b = basis[j];
    auto coeff = innerProduct(b,w);
    w = w - coeff*b;
    ++j;
  }
}
|
||||||
|
|
||||||
|
template<class VField, class Matrix>
|
||||||
|
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||||
|
{
|
||||||
|
typedef decltype(basis[0]) Field;
|
||||||
|
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||||
|
|
||||||
|
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||||
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
|
for(int k=0;k<basis.size();k++){
|
||||||
|
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
View *basis_vp = &basis_v[0];
|
||||||
|
|
||||||
|
int nrot = j1-j0;
|
||||||
|
if (!nrot) // edge case not handled gracefully by Cuda
|
||||||
|
return;
|
||||||
|
|
||||||
|
uint64_t oSites =grid->oSites();
|
||||||
|
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
||||||
|
|
||||||
|
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
||||||
|
|
||||||
|
Vector <vobj> Bt(siteBlock * nrot);
|
||||||
|
auto Bp=&Bt[0];
|
||||||
|
|
||||||
|
// GPU readable copy of matrix
|
||||||
|
Vector<double> Qt_jv(Nm*Nm);
|
||||||
|
double *Qt_p = & Qt_jv[0];
|
||||||
|
thread_for(i,Nm*Nm,{
|
||||||
|
int j = i/Nm;
|
||||||
|
int k = i%Nm;
|
||||||
|
Qt_p[i]=Qt(j,k);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Block the loop to keep storage footprint down
|
||||||
|
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
||||||
|
|
||||||
|
// remaining work in this block
|
||||||
|
int ssites=MIN(siteBlock,oSites-s);
|
||||||
|
|
||||||
|
// zero out the accumulators
|
||||||
|
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
|
||||||
|
decltype(coalescedRead(Bp[ss])) z;
|
||||||
|
z=Zero();
|
||||||
|
coalescedWrite(Bp[ss],z);
|
||||||
|
});
|
||||||
|
|
||||||
|
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
|
||||||
|
|
||||||
|
int j =sj%nrot;
|
||||||
|
int jj =j0+j;
|
||||||
|
int ss =sj/nrot;
|
||||||
|
int sss=ss+s;
|
||||||
|
|
||||||
|
for(int k=k0; k<k1; ++k){
|
||||||
|
auto tmp = coalescedRead(Bp[ss*nrot+j]);
|
||||||
|
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
|
||||||
|
int j =sj%nrot;
|
||||||
|
int jj =j0+j;
|
||||||
|
int ss =sj/nrot;
|
||||||
|
int sss=ss+s;
|
||||||
|
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract a single rotated vector
|
||||||
|
template<class Field>
|
||||||
|
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
|
||||||
|
{
|
||||||
|
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||||
|
typedef typename Field::vector_object vobj;
|
||||||
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
|
result.Checkerboard() = basis[0].Checkerboard();
|
||||||
|
|
||||||
|
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||||
|
for(int k=0;k<basis.size();k++){
|
||||||
|
basis_v.push_back(basis[k].View(AcceleratorRead));
|
||||||
|
}
|
||||||
|
vobj zz=Zero();
|
||||||
|
Vector<double> Qt_jv(Nm);
|
||||||
|
double * Qt_j = & Qt_jv[0];
|
||||||
|
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||||
|
|
||||||
|
autoView(result_v,result,AcceleratorWrite);
|
||||||
|
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||||
|
auto B=coalescedRead(zz);
|
||||||
|
for(int k=k0; k<k1; ++k){
|
||||||
|
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
|
||||||
|
}
|
||||||
|
coalescedWrite(result_v[ss], B);
|
||||||
|
});
|
||||||
|
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
|
||||||
|
{
|
||||||
|
int vlen = idx.size();
|
||||||
|
|
||||||
|
assert(vlen>=1);
|
||||||
|
assert(vlen<=sort_vals.size());
|
||||||
|
assert(vlen<=_v.size());
|
||||||
|
|
||||||
|
for (size_t i=0;i<vlen;i++) {
|
||||||
|
|
||||||
|
if (idx[i] != i) {
|
||||||
|
|
||||||
|
//////////////////////////////////////
|
||||||
|
// idx[i] is a table of desired sources giving a permutation.
|
||||||
|
// Swap v[i] with v[idx[i]].
|
||||||
|
// Find j>i for which _vnew[j] = _vold[i],
|
||||||
|
// track the move idx[j] => idx[i]
|
||||||
|
// track the move idx[i] => i
|
||||||
|
//////////////////////////////////////
|
||||||
|
size_t j;
|
||||||
|
for (j=i;j<idx.size();j++)
|
||||||
|
if (idx[j]==i)
|
||||||
|
break;
|
||||||
|
|
||||||
|
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
|
||||||
|
|
||||||
|
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
|
||||||
|
std::swap(sort_vals[i],sort_vals[idx[i]]);
|
||||||
|
|
||||||
|
idx[j] = idx[i];
|
||||||
|
idx[i] = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
|
||||||
|
{
|
||||||
|
std::vector<int> idx(sort_vals.size());
|
||||||
|
std::iota(idx.begin(), idx.end(), 0);
|
||||||
|
|
||||||
|
// sort indexes based on comparing values in v
|
||||||
|
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
|
||||||
|
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
|
||||||
|
});
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
|
||||||
|
{
|
||||||
|
std::vector<int> idx = basisSortGetIndex(sort_vals);
|
||||||
|
if (reverse)
|
||||||
|
std::reverse(idx.begin(), idx.end());
|
||||||
|
|
||||||
|
basisReorderInPlace(_v,sort_vals,idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// PAB: faster to compute the inner products first then fuse loops.
|
||||||
|
// If performance critical can improve.
|
||||||
|
template<class Field>
|
||||||
|
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
|
||||||
|
result = Zero();
|
||||||
|
assert(_v.size()==eval.size());
|
||||||
|
int N = (int)_v.size();
|
||||||
|
for (int i=0;i<N;i++) {
|
||||||
|
Field& tmp = _v[i];
|
||||||
|
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -78,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
|
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(rhs.Grid());
|
Lattice<vPredicate> ret(rhs.Grid());
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v, ret, CpuWrite);
|
||||||
thread_for( ss, rhs_v.size(), {
|
thread_for( ss, rhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
|
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
|
||||||
});
|
});
|
||||||
@ -93,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
|
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(lhs.Grid());
|
Lattice<vPredicate> ret(lhs.Grid());
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v, ret, CpuWrite);
|
||||||
thread_for( ss, lhs_v.size(), {
|
thread_for( ss, lhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs_v[ss],rhs);
|
ret_v[ss]=op(lhs_v[ss],rhs);
|
||||||
});
|
});
|
||||||
@ -107,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
|
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(rhs.Grid());
|
Lattice<vPredicate> ret(rhs.Grid());
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v, ret, CpuWrite);
|
||||||
thread_for( ss, rhs_v.size(), {
|
thread_for( ss, rhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs,rhs_v[ss]);
|
ret_v[ss]=op(lhs,rhs_v[ss]);
|
||||||
});
|
});
|
||||||
|
@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
|||||||
GridBase *grid = l.Grid();
|
GridBase *grid = l.Grid();
|
||||||
int Nsimd = grid->iSites();
|
int Nsimd = grid->iSites();
|
||||||
|
|
||||||
auto l_v = l.View();
|
autoView(l_v, l, CpuWrite);
|
||||||
thread_for( o, grid->oSites(), {
|
thread_for( o, grid->oSites(), {
|
||||||
vector_type vI;
|
vector_type vI;
|
||||||
Coordinate gcoor;
|
Coordinate gcoor;
|
||||||
@ -51,23 +51,5 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
// LatticeCoordinate();
|
|
||||||
// FIXME for debug; deprecate this; made obscelete by
|
|
||||||
template<class vobj> void lex_sites(Lattice<vobj> &l){
|
|
||||||
auto l_v = l.View();
|
|
||||||
Real *v_ptr = (Real *)&l_v[0];
|
|
||||||
size_t o_len = l.Grid()->oSites();
|
|
||||||
size_t v_len = sizeof(vobj)/sizeof(vRealF);
|
|
||||||
size_t vec_len = vRealF::Nsimd();
|
|
||||||
|
|
||||||
for(int i=0;i<o_len;i++){
|
|
||||||
for(int j=0;j<v_len;j++){
|
|
||||||
for(int vv=0;vv<vec_len;vv+=2){
|
|
||||||
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
|
|
||||||
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
|
|
||||||
}
|
|
||||||
}}
|
|
||||||
}
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -43,8 +43,8 @@ template<class vobj>
|
|||||||
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
|
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
|
||||||
{
|
{
|
||||||
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
|
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -56,9 +56,9 @@ template<class vobj>
|
|||||||
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
|
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
|
||||||
{
|
{
|
||||||
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
|
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
|
|||||||
typedef decltype(coalescedRead(ll())) sll;
|
typedef decltype(coalescedRead(ll())) sll;
|
||||||
typedef decltype(coalescedRead(rr())) srr;
|
typedef decltype(coalescedRead(rr())) srr;
|
||||||
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
|
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
accelerator_for(ss,rhs_v.size(),1,{
|
accelerator_for(ss,rhs_v.size(),1,{
|
||||||
// FIXME had issues with scalar version of outer
|
// FIXME had issues with scalar version of outer
|
||||||
// Use vector [] operator and don't read coalesce this loop
|
// Use vector [] operator and don't read coalesce this loop
|
||||||
|
@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
int block =FullGrid->_slice_block [Orthog];
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
auto X_v = X.View();
|
autoView( X_v , X, CpuRead);
|
||||||
auto Y_v = Y.View();
|
autoView( Y_v , Y, CpuRead);
|
||||||
auto R_v = R.View();
|
autoView( R_v , R, CpuWrite);
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> s_x(Nblock);
|
std::vector<vobj> s_x(Nblock);
|
||||||
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
|||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
auto X_v = X.View();
|
autoView( X_v , X, CpuRead);
|
||||||
auto R_v = R.View();
|
autoView( R_v , R, CpuWrite);
|
||||||
|
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_typeD;
|
typedef typename vobj::vector_typeD vector_typeD;
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, CpuRead);
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v , rhs, CpuRead);
|
||||||
thread_region {
|
thread_region {
|
||||||
std::vector<vobj> Left(Nblock);
|
std::vector<vobj> Left(Nblock);
|
||||||
std::vector<vobj> Right(Nblock);
|
std::vector<vobj> Right(Nblock);
|
||||||
|
@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
|
|||||||
{
|
{
|
||||||
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
|
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
thread_for( ss, lhs_v.size(), {
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
|
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
|
||||||
});
|
});
|
||||||
return ret;
|
return ret;
|
||||||
@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
|
|||||||
{
|
{
|
||||||
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
|
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
thread_for( ss, lhs_v.size(), {
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
|
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
|
||||||
});
|
});
|
||||||
return ret;
|
return ret;
|
||||||
@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
|
|||||||
template<int Index,class vobj>
|
template<int Index,class vobj>
|
||||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
|
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
|
||||||
{
|
{
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
thread_for( ss, lhs_v.size(), {
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
|
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
template<int Index,class vobj>
|
template<int Index,class vobj>
|
||||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
|
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
|
||||||
{
|
{
|
||||||
auto rhs_v = rhs.View();
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
thread_for( ss, lhs_v.size(), {
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
|
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
|
|||||||
|
|
||||||
// extract-modify-merge cycle is easiest way and this is not perf critical
|
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||||
ExtractBuffer<sobj> buf(Nsimd);
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
auto l_v = l.View();
|
autoView( l_v , l, CpuWrite);
|
||||||
if ( rank == grid->ThisRank() ) {
|
if ( rank == grid->ThisRank() ) {
|
||||||
extract(l_v[odx],buf);
|
extract(l_v[odx],buf);
|
||||||
buf[idx] = s;
|
buf[idx] = s;
|
||||||
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
|||||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
|
|
||||||
ExtractBuffer<sobj> buf(Nsimd);
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
auto l_v = l.View();
|
autoView( l_v , l, CpuWrite);
|
||||||
extract(l_v[odx],buf);
|
extract(l_v[odx],buf);
|
||||||
|
|
||||||
s = buf[idx];
|
s = buf[idx];
|
||||||
@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
|||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
// Peek a scalar object from the SIMD array
|
// Peek a scalar object from the SIMD array
|
||||||
//////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////
|
||||||
|
// Must be CPU read view
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
|
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
||||||
|
{
|
||||||
GridBase *grid = l.Grid();
|
GridBase *grid = l.getGrid();
|
||||||
|
assert(l.mode==CpuRead);
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
int Nsimd = grid->Nsimd();
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
|
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
@ -173,8 +173,7 @@ accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate
|
|||||||
idx= grid->iIndex(site);
|
idx= grid->iIndex(site);
|
||||||
odx= grid->oIndex(site);
|
odx= grid->oIndex(site);
|
||||||
|
|
||||||
auto l_v = l.View();
|
scalar_type * vp = (scalar_type *)&l[odx];
|
||||||
scalar_type * vp = (scalar_type *)&l_v[odx];
|
|
||||||
scalar_type * pt = (scalar_type *)&s;
|
scalar_type * pt = (scalar_type *)&s;
|
||||||
|
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
@ -183,18 +182,19 @@ accelerator_inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate
|
|||||||
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
// Must be CPU write view
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
|
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
||||||
|
{
|
||||||
GridBase *grid=l.Grid();
|
GridBase *grid=l.getGrid();
|
||||||
|
assert(l.mode==CpuWrite);
|
||||||
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
int Nsimd = grid->Nsimd();
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
|
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
@ -202,13 +202,11 @@ accelerator_inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate
|
|||||||
idx= grid->iIndex(site);
|
idx= grid->iIndex(site);
|
||||||
odx= grid->oIndex(site);
|
odx= grid->oIndex(site);
|
||||||
|
|
||||||
auto l_v = l.View();
|
scalar_type * vp = (scalar_type *)&l[odx];
|
||||||
scalar_type * vp = (scalar_type *)&l_v[odx];
|
|
||||||
scalar_type * pt = (scalar_type *)&s;
|
scalar_type * pt = (scalar_type *)&s;
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
vp[idx+w*Nsimd] = pt[w];
|
vp[idx+w*Nsimd] = pt[w];
|
||||||
}
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -40,8 +40,11 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
auto lhs_v = lhs.View();
|
|
||||||
auto ret_v = ret.View();
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -50,8 +53,11 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
auto lhs_v = lhs.View();
|
|
||||||
auto ret_v = ret.View();
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
|
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -24,7 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/Grid_Eigen_Dense.h>
|
#include <Grid/Grid_Eigen_Dense.h>
|
||||||
|
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
#include <Grid/lattice/Lattice_reduction_gpu.h>
|
#include <Grid/lattice/Lattice_reduction_gpu.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -38,7 +39,36 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
|||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
const int Nsimd = vobj::Nsimd();
|
// const int Nsimd = vobj::Nsimd();
|
||||||
|
const int nthread = GridThread::GetThreads();
|
||||||
|
|
||||||
|
Vector<sobj> sumarray(nthread);
|
||||||
|
for(int i=0;i<nthread;i++){
|
||||||
|
sumarray[i]=Zero();
|
||||||
|
}
|
||||||
|
|
||||||
|
thread_for(thr,nthread, {
|
||||||
|
int nwork, mywork, myoff;
|
||||||
|
nwork = osites;
|
||||||
|
GridThread::GetWork(nwork,thr,mywork,myoff);
|
||||||
|
vobj vvsum=Zero();
|
||||||
|
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||||
|
vvsum = vvsum + arg[ss];
|
||||||
|
}
|
||||||
|
sumarray[thr]=Reduce(vvsum);
|
||||||
|
});
|
||||||
|
|
||||||
|
sobj ssum=Zero(); // sum across threads
|
||||||
|
for(int i=0;i<nthread;i++){
|
||||||
|
ssum = ssum+sumarray[i];
|
||||||
|
}
|
||||||
|
return ssum;
|
||||||
|
}
|
||||||
|
template<class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_objectD sobj;
|
||||||
|
|
||||||
const int nthread = GridThread::GetThreads();
|
const int nthread = GridThread::GetThreads();
|
||||||
|
|
||||||
Vector<sobj> sumarray(nthread);
|
Vector<sobj> sumarray(nthread);
|
||||||
@ -62,23 +92,43 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
|||||||
ssum = ssum+sumarray[i];
|
ssum = ssum+sumarray[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
return ssum;
|
typedef typename vobj::scalar_object ssobj;
|
||||||
|
ssobj ret = ssum;
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
||||||
{
|
{
|
||||||
#ifdef GRID_NVCC
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
return sum_gpu(arg,osites);
|
return sum_gpu(arg,osites);
|
||||||
#else
|
#else
|
||||||
return sum_cpu(arg,osites);
|
return sum_cpu(arg,osites);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
template<class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
||||||
|
{
|
||||||
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
|
return sumD_gpu(arg,osites);
|
||||||
|
#else
|
||||||
|
return sumD_cpu(arg,osites);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||||
{
|
{
|
||||||
auto arg_v = arg.View();
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
|
autoView( arg_v, arg, AcceleratorRead);
|
||||||
Integer osites = arg.Grid()->oSites();
|
Integer osites = arg.Grid()->oSites();
|
||||||
auto ssum= sum(&arg_v[0],osites);
|
auto ssum= sum_gpu(&arg_v[0],osites);
|
||||||
|
#else
|
||||||
|
autoView(arg_v, arg, CpuRead);
|
||||||
|
Integer osites = arg.Grid()->oSites();
|
||||||
|
auto ssum= sum_cpu(&arg_v[0],osites);
|
||||||
|
#endif
|
||||||
arg.Grid()->GlobalSum(ssum);
|
arg.Grid()->GlobalSum(ssum);
|
||||||
return ssum;
|
return ssum;
|
||||||
}
|
}
|
||||||
@ -93,7 +143,7 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
|||||||
|
|
||||||
// Double inner product
|
// Double inner product
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
|
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_typeD vector_type;
|
typedef typename vobj::vector_typeD vector_type;
|
||||||
@ -101,47 +151,41 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
|
|
||||||
GridBase *grid = left.Grid();
|
GridBase *grid = left.Grid();
|
||||||
|
|
||||||
// Might make all code paths go this way.
|
|
||||||
auto left_v = left.View();
|
|
||||||
auto right_v=right.View();
|
|
||||||
|
|
||||||
const uint64_t nsimd = grid->Nsimd();
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
// Might make all code paths go this way.
|
||||||
|
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||||
|
Vector<inner_t> inner_tmp(sites);
|
||||||
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
|
{
|
||||||
|
autoView( left_v , left, AcceleratorRead);
|
||||||
|
autoView( right_v,right, AcceleratorRead);
|
||||||
|
|
||||||
// GPU - SIMT lane compliance...
|
// GPU - SIMT lane compliance...
|
||||||
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
accelerator_for( ss, sites, 1,{
|
||||||
Vector<inner_t> inner_tmp(sites);
|
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
|
||||||
|
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
|
||||||
auto x_l = left_v(ss);
|
|
||||||
auto y_l = right_v(ss);
|
|
||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
|
||||||
})
|
|
||||||
|
|
||||||
// This is in single precision and fails some tests
|
|
||||||
// Need a sumD that sums in double
|
|
||||||
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
|
|
||||||
#else
|
|
||||||
// CPU
|
|
||||||
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
|
|
||||||
Vector<inner_t> inner_tmp(sites);
|
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
|
||||||
auto x_l = left_v[ss];
|
auto x_l = left_v[ss];
|
||||||
auto y_l = right_v[ss];
|
auto y_l = right_v[ss];
|
||||||
inner_tmp_v[ss]=innerProductD(x_l,y_l);
|
inner_tmp_v[ss]=innerProductD(x_l,y_l);
|
||||||
})
|
});
|
||||||
nrm = TensorRemove(sum(inner_tmp_v,sites));
|
}
|
||||||
#endif
|
|
||||||
grid->GlobalSum(nrm);
|
|
||||||
|
|
||||||
|
// This is in single precision and fails some tests
|
||||||
|
auto anrm = sum(inner_tmp_v,sites);
|
||||||
|
nrm = anrm;
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class vobj>
|
||||||
|
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
|
||||||
|
GridBase *grid = left.Grid();
|
||||||
|
ComplexD nrm = rankInnerProduct(left,right);
|
||||||
|
grid->GlobalSum(nrm);
|
||||||
|
return nrm;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////
|
/////////////////////////
|
||||||
// Fast axpby_norm
|
// Fast axpby_norm
|
||||||
// z = a x + b y
|
// z = a x + b y
|
||||||
@ -167,44 +211,66 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
|
|
||||||
GridBase *grid = x.Grid();
|
GridBase *grid = x.Grid();
|
||||||
|
|
||||||
auto x_v=x.View();
|
|
||||||
auto y_v=y.View();
|
|
||||||
auto z_v=z.View();
|
|
||||||
|
|
||||||
const uint64_t nsimd = grid->Nsimd();
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
|
||||||
// GPU
|
// GPU
|
||||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
autoView( x_v, x, AcceleratorRead);
|
||||||
Vector<inner_t> inner_tmp(sites);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
|
||||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
|
||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
|
||||||
coalescedWrite(z_v[ss],tmp);
|
|
||||||
});
|
|
||||||
|
|
||||||
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
|
|
||||||
#else
|
|
||||||
// CPU
|
|
||||||
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
accelerator_for( ss, sites, 1,{
|
||||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
auto tmp = a*x_v[ss]+b*y_v[ss];
|
||||||
inner_tmp_v[ss]=innerProductD(tmp,tmp);
|
inner_tmp_v[ss]=innerProductD(tmp,tmp);
|
||||||
z_v[ss]=tmp;
|
z_v[ss]=tmp;
|
||||||
});
|
});
|
||||||
// Already promoted to double
|
|
||||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||||
#endif
|
|
||||||
grid->GlobalSum(nrm);
|
grid->GlobalSum(nrm);
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class vobj> strong_inline void
|
||||||
|
innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
|
||||||
|
{
|
||||||
|
conformable(left,right);
|
||||||
|
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
typedef typename vobj::vector_typeD vector_type;
|
||||||
|
Vector<ComplexD> tmp(2);
|
||||||
|
|
||||||
|
GridBase *grid = left.Grid();
|
||||||
|
|
||||||
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
|
// GPU
|
||||||
|
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||||
|
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
||||||
|
Vector<inner_t> inner_tmp(sites);
|
||||||
|
Vector<norm_t> norm_tmp(sites);
|
||||||
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
auto norm_tmp_v = &norm_tmp[0];
|
||||||
|
{
|
||||||
|
autoView(left_v,left, AcceleratorRead);
|
||||||
|
autoView(right_v,right,AcceleratorRead);
|
||||||
|
accelerator_for( ss, sites, 1,{
|
||||||
|
auto left_tmp = left_v[ss];
|
||||||
|
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
|
||||||
|
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
|
||||||
|
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
|
||||||
|
|
||||||
|
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
|
||||||
|
ip = tmp[0];
|
||||||
|
nrm = real(tmp[1]);
|
||||||
|
}
|
||||||
|
|
||||||
template<class Op,class T1>
|
template<class Op,class T1>
|
||||||
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
||||||
@ -271,7 +337,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
|||||||
|
|
||||||
// sum over reduced dimension planes, breaking out orthog dir
|
// sum over reduced dimension planes, breaking out orthog dir
|
||||||
// Parallel over orthog direction
|
// Parallel over orthog direction
|
||||||
auto Data_v=Data.View();
|
autoView( Data_v, Data, CpuRead);
|
||||||
thread_for( r,rd, {
|
thread_for( r,rd, {
|
||||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
@ -349,8 +415,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
int e2= grid->_slice_block [orthogdim];
|
int e2= grid->_slice_block [orthogdim];
|
||||||
int stride=grid->_slice_stride[orthogdim];
|
int stride=grid->_slice_stride[orthogdim];
|
||||||
|
|
||||||
auto lhv=lhs.View();
|
autoView( lhv, lhs, CpuRead);
|
||||||
auto rhv=rhs.View();
|
autoView( rhv, rhs, CpuRead);
|
||||||
thread_for( r,rd,{
|
thread_for( r,rd,{
|
||||||
|
|
||||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||||
@ -457,14 +523,12 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
|
|
||||||
tensor_reduced at; at=av;
|
tensor_reduced at; at=av;
|
||||||
|
|
||||||
auto Rv=R.View();
|
autoView( Rv, R, CpuWrite);
|
||||||
auto Xv=X.View();
|
autoView( Xv, X, CpuRead);
|
||||||
auto Yv=Y.View();
|
autoView( Yv, Y, CpuRead);
|
||||||
thread_for_collapse(2, n, e1, {
|
thread_for2d( n, e1, b,e2, {
|
||||||
for(int b=0;b<e2;b++){
|
|
||||||
int ss= so+n*stride+b;
|
int ss= so+n*stride+b;
|
||||||
Rv[ss] = at*Xv[ss]+Yv[ss];
|
Rv[ss] = at*Xv[ss]+Yv[ss];
|
||||||
}
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -517,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
auto X_v=X.View();
|
autoView( X_v, X, CpuRead);
|
||||||
auto Y_v=Y.View();
|
autoView( Y_v, Y, CpuRead);
|
||||||
auto R_v=R.View();
|
autoView( R_v, R, CpuWrite);
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
Vector<vobj> s_x(Nblock);
|
Vector<vobj> s_x(Nblock);
|
||||||
@ -564,13 +628,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
|||||||
// int nl=1;
|
// int nl=1;
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
//FIXME package in a convenient iterator
|
||||||
|
// thread_for2d_in_region
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
int stride=FullGrid->_slice_stride[Orthog];
|
int stride=FullGrid->_slice_stride[Orthog];
|
||||||
int block =FullGrid->_slice_block [Orthog];
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
auto R_v = R.View();
|
autoView( R_v, R, CpuWrite);
|
||||||
auto X_v = X.View();
|
autoView( X_v, X, CpuRead);
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> s_x(Nblock);
|
std::vector<vobj> s_x(Nblock);
|
||||||
@ -628,8 +693,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_typeD;
|
typedef typename vobj::vector_typeD vector_typeD;
|
||||||
|
|
||||||
auto lhs_v=lhs.View();
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
auto rhs_v=rhs.View();
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> Left(Nblock);
|
std::vector<vobj> Left(Nblock);
|
||||||
|
@ -1,7 +1,13 @@
|
|||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
#define WARP_SIZE 32
|
#ifdef GRID_HIP
|
||||||
|
extern hipDeviceProp_t *gpu_props;
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
extern cudaDeviceProp *gpu_props;
|
extern cudaDeviceProp *gpu_props;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define WARP_SIZE 32
|
||||||
__device__ unsigned int retirementCount = 0;
|
__device__ unsigned int retirementCount = 0;
|
||||||
|
|
||||||
template <class Iterator>
|
template <class Iterator>
|
||||||
@ -19,7 +25,12 @@ template <class Iterator>
|
|||||||
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
|
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
|
||||||
|
|
||||||
int device;
|
int device;
|
||||||
|
#ifdef GRID_CUDA
|
||||||
cudaGetDevice(&device);
|
cudaGetDevice(&device);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipGetDevice(&device);
|
||||||
|
#endif
|
||||||
|
|
||||||
Iterator warpSize = gpu_props[device].warpSize;
|
Iterator warpSize = gpu_props[device].warpSize;
|
||||||
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
|
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
|
||||||
@ -147,7 +158,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
|
|||||||
sobj *smem = (sobj *)shmem_pointer;
|
sobj *smem = (sobj *)shmem_pointer;
|
||||||
|
|
||||||
// wait until all outstanding memory instructions in this thread are finished
|
// wait until all outstanding memory instructions in this thread are finished
|
||||||
__threadfence();
|
acceleratorFence();
|
||||||
|
|
||||||
if (tid==0) {
|
if (tid==0) {
|
||||||
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
|
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
|
||||||
@ -156,7 +167,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// each thread must read the correct value of amLast
|
// each thread must read the correct value of amLast
|
||||||
__syncthreads();
|
acceleratorSynchroniseAll();
|
||||||
|
|
||||||
if (amLast) {
|
if (amLast) {
|
||||||
// reduce buffer[0], ..., buffer[gridDim.x-1]
|
// reduce buffer[0], ..., buffer[gridDim.x-1]
|
||||||
@ -199,13 +210,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
|||||||
sobj *buffer_v = &buffer[0];
|
sobj *buffer_v = &buffer[0];
|
||||||
|
|
||||||
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
|
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
|
||||||
cudaDeviceSynchronize();
|
accelerator_barrier();
|
||||||
|
|
||||||
cudaError err = cudaGetLastError();
|
|
||||||
if ( cudaSuccess != err ) {
|
|
||||||
printf("Cuda error %s\n",cudaGetErrorString( err ));
|
|
||||||
exit(0);
|
|
||||||
}
|
|
||||||
auto result = buffer_v[0];
|
auto result = buffer_v[0];
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -375,7 +375,7 @@ public:
|
|||||||
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
|
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
|
||||||
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
||||||
|
|
||||||
auto l_v = l.View();
|
autoView(l_v, l, CpuWrite);
|
||||||
thread_for( ss, osites, {
|
thread_for( ss, osites, {
|
||||||
ExtractBuffer<scalar_object> buf(Nsimd);
|
ExtractBuffer<scalar_object> buf(Nsimd);
|
||||||
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
||||||
@ -462,7 +462,7 @@ public:
|
|||||||
|
|
||||||
{
|
{
|
||||||
// Obtain one reseeded generator per thread
|
// Obtain one reseeded generator per thread
|
||||||
int Nthread = GridThread::GetThreads();
|
int Nthread = 32; // Hardwire a good level or parallelism
|
||||||
std::vector<RngEngine> seeders(Nthread);
|
std::vector<RngEngine> seeders(Nthread);
|
||||||
for(int t=0;t<Nthread;t++){
|
for(int t=0;t<Nthread;t++){
|
||||||
seeders[t] = Reseed(master_engine);
|
seeders[t] = Reseed(master_engine);
|
||||||
|
@ -37,17 +37,19 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Trace
|
// Trace
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
/*
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
|
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
|
||||||
auto ret_v = ret.View();
|
autoView(ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView(lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Trace Index level dependent operation
|
// Trace Index level dependent operation
|
||||||
@ -56,8 +58,8 @@ template<int Index,class vobj>
|
|||||||
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
|
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -46,11 +47,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// remove and insert a half checkerboard
|
// remove and insert a half checkerboard
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
||||||
|
{
|
||||||
half.Checkerboard() = cb;
|
half.Checkerboard() = cb;
|
||||||
|
|
||||||
auto half_v = half.View();
|
autoView( half_v, half, CpuWrite);
|
||||||
auto full_v = full.View();
|
autoView( full_v, full, CpuRead);
|
||||||
thread_for(ss, full.Grid()->oSites(),{
|
thread_for(ss, full.Grid()->oSites(),{
|
||||||
int cbos;
|
int cbos;
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -63,10 +65,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
||||||
|
{
|
||||||
int cb = half.Checkerboard();
|
int cb = half.Checkerboard();
|
||||||
auto half_v = half.View();
|
autoView( half_v , half, CpuRead);
|
||||||
auto full_v = full.View();
|
autoView( full_v , full, CpuWrite);
|
||||||
thread_for(ss,full.Grid()->oSites(),{
|
thread_for(ss,full.Grid()->oSites(),{
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -82,94 +85,136 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex,int nbasis>
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Flexible Type Conversion for internal promotion to double as well as graceful
|
||||||
|
// treatment of scalar-compatible types
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
|
||||||
|
out = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
|
||||||
|
out = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef GRID_SIMT
|
||||||
|
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
|
||||||
|
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
|
||||||
|
}
|
||||||
|
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
|
||||||
|
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in;
|
||||||
|
}
|
||||||
|
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
|
||||||
|
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
|
||||||
|
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
|
||||||
|
}
|
||||||
|
|
||||||
|
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
|
||||||
|
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2,int N>
|
||||||
|
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
|
||||||
|
template<typename T1,typename T2,int N>
|
||||||
|
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
|
||||||
|
|
||||||
|
template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
|
||||||
|
accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
|
||||||
|
convertType(out,in._internal);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2>
|
||||||
|
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
|
||||||
|
convertType(out._internal,in);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2,int N>
|
||||||
|
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
|
||||||
|
for (int i=0;i<N;i++)
|
||||||
|
for (int j=0;j<N;j++)
|
||||||
|
convertType(out._internal[i][j],in._internal[i][j]);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2,int N>
|
||||||
|
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
|
||||||
|
for (int i=0;i<N;i++)
|
||||||
|
convertType(out._internal[i],in._internal[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
|
||||||
|
accelerator_inline void convertType(T & out, const T & in) {
|
||||||
|
out = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T1,typename T2>
|
||||||
|
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
||||||
|
autoView( out_v , out,AcceleratorWrite);
|
||||||
|
autoView( in_v , in ,AcceleratorRead);
|
||||||
|
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
|
||||||
|
convertType(out_v[ss],in_v(ss));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// precision-promoted local inner product
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class vobj>
|
||||||
|
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
|
||||||
|
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>>
|
||||||
|
{
|
||||||
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
|
|
||||||
|
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
|
||||||
|
Lattice<iScalar<t_inner>> ret(lhs.Grid());
|
||||||
|
|
||||||
|
{
|
||||||
|
autoView(ret_v, ret,AcceleratorWrite);
|
||||||
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
|
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// block routines
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class vobj,class CComplex,int nbasis,class VLattice>
|
||||||
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||||
const Lattice<vobj> &fineData,
|
const Lattice<vobj> &fineData,
|
||||||
const std::vector<Lattice<vobj> > &Basis)
|
const VLattice &Basis)
|
||||||
{
|
{
|
||||||
GridBase * fine = fineData.Grid();
|
GridBase * fine = fineData.Grid();
|
||||||
GridBase * coarse= coarseData.Grid();
|
GridBase * coarse= coarseData.Grid();
|
||||||
|
|
||||||
Lattice<CComplex> ip(coarse);
|
Lattice<iScalar<CComplex>> ip(coarse);
|
||||||
|
Lattice<vobj> fineDataRed = fineData;
|
||||||
|
|
||||||
// auto fineData_ = fineData.View();
|
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
||||||
auto coarseData_ = coarseData.View();
|
autoView( ip_ , ip, AcceleratorWrite);
|
||||||
auto ip_ = ip.View();
|
|
||||||
for(int v=0;v<nbasis;v++) {
|
for(int v=0;v<nbasis;v++) {
|
||||||
blockInnerProduct(ip,Basis[v],fineData);
|
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
|
||||||
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
||||||
coalescedWrite(coarseData_[sc](v),ip_(sc));
|
convertType(coarseData_[sc](v),ip_[sc]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// improve numerical stability of projection
|
||||||
|
// |fine> = |fine> - <basis|fine> |basis>
|
||||||
|
ip=-ip;
|
||||||
|
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex,int nbasis>
|
|
||||||
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|
||||||
const Lattice<vobj> &fineData,
|
|
||||||
const std::vector<Lattice<vobj> > &Basis)
|
|
||||||
{
|
|
||||||
typedef iVector<CComplex,nbasis > coarseSiteData;
|
|
||||||
coarseSiteData elide;
|
|
||||||
typedef decltype(coalescedRead(elide)) ScalarComplex;
|
|
||||||
GridBase * fine = fineData.Grid();
|
|
||||||
GridBase * coarse= coarseData.Grid();
|
|
||||||
int _ndimension = coarse->_ndimension;
|
|
||||||
|
|
||||||
// checks
|
template<class vobj,class vobj2,class CComplex>
|
||||||
assert( nbasis == Basis.size() );
|
|
||||||
subdivides(coarse,fine);
|
|
||||||
for(int i=0;i<nbasis;i++){
|
|
||||||
conformable(Basis[i],fineData);
|
|
||||||
}
|
|
||||||
|
|
||||||
Coordinate block_r (_ndimension);
|
|
||||||
|
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
|
||||||
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
|
|
||||||
}
|
|
||||||
int blockVol = fine->oSites()/coarse->oSites();
|
|
||||||
|
|
||||||
coarseData=Zero();
|
|
||||||
|
|
||||||
auto fineData_ = fineData.View();
|
|
||||||
auto coarseData_ = coarseData.View();
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
|
|
||||||
// Otherwise do fine inner product per site, and make the update atomic
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
|
|
||||||
|
|
||||||
auto sc=sci/nbasis;
|
|
||||||
auto i=sci%nbasis;
|
|
||||||
auto Basis_ = Basis[i].View();
|
|
||||||
|
|
||||||
Coordinate coor_c(_ndimension);
|
|
||||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
|
||||||
|
|
||||||
int sf;
|
|
||||||
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
|
|
||||||
|
|
||||||
for(int sb=0;sb<blockVol;sb++){
|
|
||||||
|
|
||||||
Coordinate coor_b(_ndimension);
|
|
||||||
Coordinate coor_f(_ndimension);
|
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
|
|
||||||
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
|
|
||||||
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
|
||||||
|
|
||||||
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
|
|
||||||
}
|
|
||||||
coalescedWrite(coarseData_[sc](i),reduce);
|
|
||||||
});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
|
||||||
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
||||||
const Lattice<CComplex> &coarseA,
|
const Lattice<CComplex> &coarseA,
|
||||||
const Lattice<vobj> &fineX,
|
const Lattice<vobj2> &fineX,
|
||||||
const Lattice<vobj> &fineY)
|
const Lattice<vobj> &fineY)
|
||||||
{
|
{
|
||||||
GridBase * fine = fineZ.Grid();
|
GridBase * fine = fineZ.Grid();
|
||||||
@ -191,10 +236,10 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto fineZ_ = fineZ.View();
|
autoView( fineZ_ , fineZ, AcceleratorWrite);
|
||||||
auto fineX_ = fineX.View();
|
autoView( fineX_ , fineX, AcceleratorRead);
|
||||||
auto fineY_ = fineY.View();
|
autoView( fineY_ , fineY, AcceleratorRead);
|
||||||
auto coarseA_= coarseA.View();
|
autoView( coarseA_, coarseA, AcceleratorRead);
|
||||||
|
|
||||||
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
||||||
|
|
||||||
@ -207,13 +252,50 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
|
|
||||||
// z = A x + y
|
// z = A x + y
|
||||||
coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));
|
#ifdef GRID_SIMT
|
||||||
|
typename vobj2::tensor_reduced::scalar_object cA;
|
||||||
|
typename vobj::scalar_object cAx;
|
||||||
|
#else
|
||||||
|
typename vobj2::tensor_reduced cA;
|
||||||
|
vobj cAx;
|
||||||
|
#endif
|
||||||
|
convertType(cA,TensorRemove(coarseA_(sc)));
|
||||||
|
auto prod = cA*fineX_(sf);
|
||||||
|
convertType(cAx,prod);
|
||||||
|
coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class vobj,class CComplex>
|
||||||
|
inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
|
||||||
|
const Lattice<vobj> &fineX,
|
||||||
|
const Lattice<vobj> &fineY)
|
||||||
|
{
|
||||||
|
typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
|
||||||
|
|
||||||
|
GridBase *coarse(CoarseInner.Grid());
|
||||||
|
GridBase *fine (fineX.Grid());
|
||||||
|
|
||||||
|
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
|
||||||
|
Lattice<dotp> coarse_inner(coarse);
|
||||||
|
|
||||||
|
// Precision promotion
|
||||||
|
fine_inner = localInnerProductD<vobj>(fineX,fineY);
|
||||||
|
blockSum(coarse_inner,fine_inner);
|
||||||
|
{
|
||||||
|
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
|
||||||
|
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
|
||||||
|
accelerator_for(ss, coarse->oSites(), 1, {
|
||||||
|
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vobj,class CComplex> // deprecate
|
||||||
inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
||||||
const Lattice<vobj> &fineX,
|
const Lattice<vobj> &fineX,
|
||||||
const Lattice<vobj> &fineY)
|
const Lattice<vobj> &fineY)
|
||||||
@ -227,15 +309,17 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
|||||||
Lattice<dotp> coarse_inner(coarse);
|
Lattice<dotp> coarse_inner(coarse);
|
||||||
|
|
||||||
// Precision promotion?
|
// Precision promotion?
|
||||||
auto CoarseInner_ = CoarseInner.View();
|
|
||||||
auto coarse_inner_ = coarse_inner.View();
|
|
||||||
|
|
||||||
fine_inner = localInnerProduct(fineX,fineY);
|
fine_inner = localInnerProduct(fineX,fineY);
|
||||||
blockSum(coarse_inner,fine_inner);
|
blockSum(coarse_inner,fine_inner);
|
||||||
|
{
|
||||||
|
autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
|
||||||
|
autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
|
||||||
accelerator_for(ss, coarse->oSites(), 1, {
|
accelerator_for(ss, coarse->oSites(), 1, {
|
||||||
CoarseInner_[ss] = coarse_inner_[ss];
|
CoarseInner_[ss] = coarse_inner_[ss];
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class vobj,class CComplex>
|
||||||
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
|
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
|
||||||
{
|
{
|
||||||
@ -266,8 +350,8 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
|
|
||||||
// Turn this around to loop threaded over sc and interior loop
|
// Turn this around to loop threaded over sc and interior loop
|
||||||
// over sf would thread better
|
// over sf would thread better
|
||||||
auto coarseData_ = coarseData.View();
|
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
||||||
auto fineData_ = fineData.View();
|
autoView( fineData_ , fineData, AcceleratorRead);
|
||||||
|
|
||||||
accelerator_for(sc,coarse->oSites(),1,{
|
accelerator_for(sc,coarse->oSites(),1,{
|
||||||
|
|
||||||
@ -292,6 +376,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
|
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
|
||||||
{
|
{
|
||||||
@ -313,8 +398,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class CComplex,class VLattice>
|
||||||
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
|
inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
|
||||||
{
|
{
|
||||||
GridBase *coarse = ip.Grid();
|
GridBase *coarse = ip.Grid();
|
||||||
GridBase *fine = Basis[0].Grid();
|
GridBase *fine = Basis[0].Grid();
|
||||||
@ -330,15 +415,22 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
|
|||||||
for(int v=0;v<nbasis;v++) {
|
for(int v=0;v<nbasis;v++) {
|
||||||
for(int u=0;u<v;u++) {
|
for(int u=0;u<v;u++) {
|
||||||
//Inner product & remove component
|
//Inner product & remove component
|
||||||
blockInnerProduct(ip,Basis[u],Basis[v]);
|
blockInnerProductD(ip,Basis[u],Basis[v]);
|
||||||
ip = -ip;
|
ip = -ip;
|
||||||
blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]);
|
blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
|
||||||
}
|
}
|
||||||
blockNormalise(ip,Basis[v]);
|
blockNormalise(ip,Basis[v]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class vobj,class CComplex>
|
||||||
|
inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated inaccurate naming
|
||||||
|
{
|
||||||
|
blockOrthonormalize(ip,Basis);
|
||||||
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
// TODO: CPU optimized version here
|
||||||
template<class vobj,class CComplex,int nbasis>
|
template<class vobj,class CComplex,int nbasis>
|
||||||
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||||
Lattice<vobj> &fineData,
|
Lattice<vobj> &fineData,
|
||||||
@ -360,8 +452,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
for(int d=0 ; d<_ndimension;d++){
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||||
}
|
}
|
||||||
auto fineData_ = fineData.View();
|
autoView( fineData_ , fineData, AcceleratorWrite);
|
||||||
auto coarseData_ = coarseData.View();
|
autoView( coarseData_ , coarseData, AcceleratorRead);
|
||||||
|
|
||||||
// Loop with a cache friendly loop ordering
|
// Loop with a cache friendly loop ordering
|
||||||
accelerator_for(sf,fine->oSites(),1,{
|
accelerator_for(sf,fine->oSites(),1,{
|
||||||
@ -374,7 +466,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
|
|
||||||
for(int i=0;i<nbasis;i++) {
|
for(int i=0;i<nbasis;i++) {
|
||||||
auto basis_ = Basis[i].View();
|
/* auto basis_ = Basis[i], );*/
|
||||||
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
|
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
|
||||||
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
|
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
|
||||||
}
|
}
|
||||||
@ -383,24 +475,25 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
|
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
template<class vobj,class CComplex,int nbasis>
|
template<class vobj,class CComplex,int nbasis,class VLattice>
|
||||||
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||||
Lattice<vobj> &fineData,
|
Lattice<vobj> &fineData,
|
||||||
const std::vector<Lattice<vobj> > &Basis)
|
const VLattice &Basis)
|
||||||
{
|
{
|
||||||
GridBase * fine = fineData.Grid();
|
GridBase * fine = fineData.Grid();
|
||||||
GridBase * coarse= coarseData.Grid();
|
GridBase * coarse= coarseData.Grid();
|
||||||
|
|
||||||
fineData=Zero();
|
fineData=Zero();
|
||||||
for(int i=0;i<nbasis;i++) {
|
for(int i=0;i<nbasis;i++) {
|
||||||
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
||||||
Lattice<CComplex> cip(coarse);
|
|
||||||
auto cip_ = cip.View();
|
//Lattice<CComplex> cip(coarse);
|
||||||
auto ip_ = ip.View();
|
//autoView( cip_ , cip, AcceleratorWrite);
|
||||||
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
|
//autoView( ip_ , ip, AcceleratorRead);
|
||||||
coalescedWrite(cip_[sc], ip_(sc)());
|
//accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
|
||||||
});
|
// coalescedWrite(cip_[sc], ip_(sc)());
|
||||||
blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
|
// });
|
||||||
|
//blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
|
||||||
|
blockZAXPY(fineData,ip,Basis[i],fineData);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -427,15 +520,17 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
|
|||||||
assert(ig->lSites() == og->lSites());
|
assert(ig->lSites() == og->lSites());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
autoView(in_v,in,CpuRead);
|
||||||
|
autoView(out_v,out,CpuWrite);
|
||||||
thread_for(idx, ig->lSites(),{
|
thread_for(idx, ig->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
ssobj ss;
|
ssobj ss;
|
||||||
|
|
||||||
Coordinate lcoor(ni);
|
Coordinate lcoor(ni);
|
||||||
ig->LocalIndexToLocalCoor(idx,lcoor);
|
ig->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
peekLocalSite(s,in,lcoor);
|
peekLocalSite(s,in_v,lcoor);
|
||||||
ss=s;
|
ss=s;
|
||||||
pokeLocalSite(ss,out,lcoor);
|
pokeLocalSite(ss,out_v,lcoor);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -470,8 +565,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
Coordinate rdt = Tg->_rdimensions;
|
Coordinate rdt = Tg->_rdimensions;
|
||||||
Coordinate ist = Tg->_istride;
|
Coordinate ist = Tg->_istride;
|
||||||
Coordinate ost = Tg->_ostride;
|
Coordinate ost = Tg->_ostride;
|
||||||
auto t_v = To.View();
|
|
||||||
auto f_v = From.View();
|
autoView( t_v , To, AcceleratorWrite);
|
||||||
|
autoView( f_v , From, AcceleratorRead);
|
||||||
accelerator_for(idx,Fg->lSites(),1,{
|
accelerator_for(idx,Fg->lSites(),1,{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate Fcoor(nd);
|
Coordinate Fcoor(nd);
|
||||||
@ -494,8 +590,6 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
|
tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
|
||||||
}
|
}
|
||||||
// peekLocalSite(s,From,Fcoor);
|
|
||||||
// pokeLocalSite(s,To ,Tcoor);
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -526,6 +620,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
|
autoView(lowDimv,lowDim,CpuRead);
|
||||||
|
autoView(higherDimv,higherDim,CpuWrite);
|
||||||
thread_for(idx,lg->lSites(),{
|
thread_for(idx,lg->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
@ -538,8 +634,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
hcoor[d]=lcoor[ddl++];
|
hcoor[d]=lcoor[ddl++];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
peekLocalSite(s,lowDim,lcoor);
|
peekLocalSite(s,lowDimv,lcoor);
|
||||||
pokeLocalSite(s,higherDim,hcoor);
|
pokeLocalSite(s,higherDimv,hcoor);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -567,6 +663,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
|
autoView(lowDimv,lowDim,CpuWrite);
|
||||||
|
autoView(higherDimv,higherDim,CpuRead);
|
||||||
thread_for(idx,lg->lSites(),{
|
thread_for(idx,lg->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
@ -579,8 +677,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
hcoor[d]=lcoor[ddl++];
|
hcoor[d]=lcoor[ddl++];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
peekLocalSite(s,higherDim,hcoor);
|
peekLocalSite(s,higherDimv,hcoor);
|
||||||
pokeLocalSite(s,lowDim,lcoor);
|
pokeLocalSite(s,lowDimv,lcoor);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -608,6 +706,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
|
autoView(lowDimv,lowDim,CpuRead);
|
||||||
|
autoView(higherDimv,higherDim,CpuWrite);
|
||||||
thread_for(idx,lg->lSites(),{
|
thread_for(idx,lg->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
@ -616,8 +716,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
if( lcoor[orthog] == slice_lo ) {
|
if( lcoor[orthog] == slice_lo ) {
|
||||||
hcoor=lcoor;
|
hcoor=lcoor;
|
||||||
hcoor[orthog] = slice_hi;
|
hcoor[orthog] = slice_hi;
|
||||||
peekLocalSite(s,lowDim,lcoor);
|
peekLocalSite(s,lowDimv,lcoor);
|
||||||
pokeLocalSite(s,higherDim,hcoor);
|
pokeLocalSite(s,higherDimv,hcoor);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -645,6 +745,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
|
|||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
|
autoView(lowDimv,lowDim,CpuWrite);
|
||||||
|
autoView(higherDimv,higherDim,CpuRead);
|
||||||
thread_for(idx,lg->lSites(),{
|
thread_for(idx,lg->lSites(),{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
@ -653,8 +755,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
|
|||||||
if( lcoor[orthog] == slice_lo ) {
|
if( lcoor[orthog] == slice_lo ) {
|
||||||
hcoor=lcoor;
|
hcoor=lcoor;
|
||||||
hcoor[orthog] = slice_hi;
|
hcoor[orthog] = slice_hi;
|
||||||
peekLocalSite(s,higherDim,hcoor);
|
peekLocalSite(s,higherDimv,hcoor);
|
||||||
pokeLocalSite(s,lowDim,lcoor);
|
pokeLocalSite(s,lowDimv,lcoor);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -718,7 +820,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
}
|
}
|
||||||
|
|
||||||
//loop over outer index
|
//loop over outer index
|
||||||
auto in_v = in.View();
|
autoView( in_v , in, CpuRead);
|
||||||
thread_for(in_oidx,in_grid->oSites(),{
|
thread_for(in_oidx,in_grid->oSites(),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
||||||
@ -811,7 +913,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
icoor[lane].resize(ndim);
|
icoor[lane].resize(ndim);
|
||||||
grid->iCoorFromIindex(icoor[lane],lane);
|
grid->iCoorFromIindex(icoor[lane],lane);
|
||||||
}
|
}
|
||||||
auto out_v = out.View();
|
autoView( out_v , out, CpuWrite);
|
||||||
thread_for(oidx, grid->oSites(),{
|
thread_for(oidx, grid->oSites(),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
ExtractPointerArray<sobj> ptrs(nsimd);
|
ExtractPointerArray<sobj> ptrs(nsimd);
|
||||||
@ -914,7 +1016,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
|||||||
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
||||||
unvectorizeToLexOrdArray(in_slex_conv, in);
|
unvectorizeToLexOrdArray(in_slex_conv, in);
|
||||||
|
|
||||||
auto out_v = out.View();
|
autoView( out_v , out, CpuWrite);
|
||||||
thread_for(out_oidx,out_grid->oSites(),{
|
thread_for(out_oidx,out_grid->oSites(),{
|
||||||
Coordinate out_ocoor(ndim);
|
Coordinate out_ocoor(ndim);
|
||||||
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
||||||
|
@ -38,16 +38,18 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Transpose
|
// Transpose
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
/*
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
|
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Index level dependent transpose
|
// Index level dependent transpose
|
||||||
@ -56,8 +58,8 @@ template<int Index,class vobj>
|
|||||||
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
|
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
|
||||||
auto ret_v = ret.View();
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View();
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
|
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
auto rhs = rhs_i.View();
|
autoView( rhs, rhs_i, AcceleratorRead);
|
||||||
auto ret = ret_i.View();
|
autoView( ret, ret_i, AcceleratorWrite);
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),1,{
|
accelerator_for(ss,rhs.size(),1,{
|
||||||
ret[ss]=pow(rhs[ss],y);
|
ret[ss]=pow(rhs[ss],y);
|
||||||
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
|||||||
}
|
}
|
||||||
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
auto rhs = rhs_i.View();
|
autoView( rhs , rhs_i, AcceleratorRead);
|
||||||
auto ret = ret_i.View();
|
autoView( ret , ret_i, AcceleratorWrite);
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],mod(rhs(ss),y));
|
coalescedWrite(ret[ss],mod(rhs(ss),y));
|
||||||
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
auto ret = ret_i.View();
|
autoView( ret , ret_i, AcceleratorWrite);
|
||||||
auto rhs = rhs_i.View();
|
autoView( rhs , rhs_i, AcceleratorRead);
|
||||||
ret.Checkerboard() = rhs_i.Checkerboard();
|
ret.Checkerboard() = rhs_i.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],div(rhs(ss),y));
|
coalescedWrite(ret[ss],div(rhs(ss),y));
|
||||||
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
|
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
auto rhs = rhs_i.View();
|
autoView( rhs , rhs_i, AcceleratorRead);
|
||||||
auto ret = ret_i.View();
|
autoView( ret , ret_i, AcceleratorWrite);
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
|
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
|
||||||
|
168
Grid/lattice/Lattice_view.h
Normal file
168
Grid/lattice/Lattice_view.h
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
#pragma once
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
///////////////////////////////////////////////////////////////////
|
||||||
|
// Base class which can be used by traits to pick up behaviour
|
||||||
|
///////////////////////////////////////////////////////////////////
|
||||||
|
// Empty tag type with no members; LatticeAccelerator below derives from it.
class LatticeBase {};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Conformable checks; same instance of Grid required
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
|
||||||
|
{
|
||||||
|
assert(lhs == rhs);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Minimal base class containing only data valid to access from accelerator
|
||||||
|
// _odata will be a managed pointer in CUDA
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Force access to lattice through a view object.
|
||||||
|
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
|
||||||
|
// strict since host could could in principle direct access through the lattice object
|
||||||
|
// Need to decide programming model.
|
||||||
|
#define LATTICE_VIEW_STRICT
|
||||||
|
template<class vobj> class LatticeAccelerator : public LatticeBase
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
//public:
|
||||||
|
GridBase *_grid;
|
||||||
|
int checkerboard;
|
||||||
|
vobj *_odata; // A managed pointer
|
||||||
|
uint64_t _odata_size;
|
||||||
|
ViewAdvise advise;
|
||||||
|
public:
|
||||||
|
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
|
||||||
|
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
|
||||||
|
accelerator_inline int Checkerboard(void) const { return checkerboard; };
|
||||||
|
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
|
||||||
|
accelerator_inline ViewAdvise Advise(void) const { return advise; };
|
||||||
|
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
|
||||||
|
accelerator_inline void Conformable(GridBase * &grid) const
|
||||||
|
{
|
||||||
|
if (grid) conformable(grid, _grid);
|
||||||
|
else grid = _grid;
|
||||||
|
};
|
||||||
|
// Host only
|
||||||
|
GridBase * getGrid(void) const { return _grid; };
|
||||||
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// A View class which provides accessor to the data.
|
||||||
|
// This will be safe to call from accelerator_for and is trivially copy constructible
|
||||||
|
// The copy constructor for this will need to be used by device lambda functions
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class vobj>
|
||||||
|
class LatticeView : public LatticeAccelerator<vobj>
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
// Rvalue
|
||||||
|
ViewMode mode;
|
||||||
|
void * cpu_ptr;
|
||||||
|
#ifdef GRID_SIMT
|
||||||
|
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
|
||||||
|
return coalescedRead(this->_odata[i]);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
|
||||||
|
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
|
||||||
|
|
||||||
|
accelerator_inline uint64_t begin(void) const { return 0;};
|
||||||
|
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
||||||
|
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
|
||||||
|
|
||||||
|
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
|
||||||
|
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
|
||||||
|
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
|
||||||
|
{
|
||||||
|
this->ViewOpen(mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Host functions
|
||||||
|
void ViewOpen(ViewMode mode)
|
||||||
|
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
|
||||||
|
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
|
||||||
|
this->cpu_ptr = (void *)this->_odata;
|
||||||
|
this->mode = mode;
|
||||||
|
this->_odata =(vobj *)
|
||||||
|
MemoryManager::ViewOpen(this->cpu_ptr,
|
||||||
|
this->_odata_size*sizeof(vobj),
|
||||||
|
mode,
|
||||||
|
this->advise);
|
||||||
|
}
|
||||||
|
void ViewClose(void)
|
||||||
|
{ // Inform the manager
|
||||||
|
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
|
||||||
|
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
// Little autoscope assister
|
||||||
|
// Scope guard for a view: stores its own copy of the view and calls
// ViewClose() on that copy when the guard is destroyed.
template<class View>
class ViewCloser
{
  View v; // Take a copy of view and call view close when I go out of scope automatically
public:
  // explicit: a guard should only ever be created on purpose
  explicit ViewCloser(View &_v) : v(_v) {}
  ~ViewCloser() { v.ViewClose(); }

  // A copied guard would call ViewClose() a second time for the same view
  ViewCloser(const ViewCloser &) = delete;
  ViewCloser &operator=(const ViewCloser &) = delete;
};
|
||||||
|
|
||||||
|
// Declares `l_v` as the result of (l).View(mode) and a ViewCloser holding a
// copy of it, whose destructor calls ViewClose() when the enclosing scope ends.
// `l` is parenthesised so that any expression (e.g. *ptr) can be passed.
#define autoView(l_v,l,mode)				\
  auto l_v = (l).View(mode);				\
  ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Lattice expression types used by ET to assemble the AST
|
||||||
|
//
|
||||||
|
// Need to be able to detect code paths according to the whether a lattice object or not
|
||||||
|
// so introduce some trait type things
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
class LatticeExpressionBase {};
|
||||||
|
|
||||||
|
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
|
||||||
|
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
|
||||||
|
|
||||||
|
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
|
||||||
|
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
|
||||||
|
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
|
||||||
|
|
||||||
|
template <typename Op, typename _T1>
|
||||||
|
class LatticeUnaryExpression : public LatticeExpressionBase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef typename ViewMap<_T1>::Type T1;
|
||||||
|
Op op;
|
||||||
|
T1 arg1;
|
||||||
|
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Op, typename _T1, typename _T2>
|
||||||
|
class LatticeBinaryExpression : public LatticeExpressionBase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef typename ViewMap<_T1>::Type T1;
|
||||||
|
typedef typename ViewMap<_T2>::Type T2;
|
||||||
|
Op op;
|
||||||
|
T1 arg1;
|
||||||
|
T2 arg2;
|
||||||
|
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Op, typename _T1, typename _T2, typename _T3>
|
||||||
|
class LatticeTrinaryExpression : public LatticeExpressionBase
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef typename ViewMap<_T1>::Type T1;
|
||||||
|
typedef typename ViewMap<_T2>::Type T2;
|
||||||
|
typedef typename ViewMap<_T3>::Type T3;
|
||||||
|
Op op;
|
||||||
|
T1 arg1;
|
||||||
|
T2 arg2;
|
||||||
|
T3 arg3;
|
||||||
|
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
|
||||||
|
};
|
||||||
|
NAMESPACE_END(Grid);
|
@ -341,7 +341,7 @@ class BinaryIO {
|
|||||||
int ieee32big = (format == std::string("IEEE32BIG"));
|
int ieee32big = (format == std::string("IEEE32BIG"));
|
||||||
int ieee32 = (format == std::string("IEEE32"));
|
int ieee32 = (format == std::string("IEEE32"));
|
||||||
int ieee64big = (format == std::string("IEEE64BIG"));
|
int ieee64big = (format == std::string("IEEE64BIG"));
|
||||||
int ieee64 = (format == std::string("IEEE64"));
|
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
|
||||||
assert(ieee64||ieee32|ieee64big||ieee32big);
|
assert(ieee64||ieee32|ieee64big||ieee32big);
|
||||||
assert((ieee64+ieee32+ieee64big+ieee32big)==1);
|
assert((ieee64+ieee32+ieee64big+ieee32big)==1);
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class fobj,class sobj>
|
||||||
|
struct GaugeDoubleStoredMunger{
|
||||||
|
void operator()(fobj &in, sobj &out) {
|
||||||
|
for (int mu = 0; mu < Nds; mu++) {
|
||||||
|
for (int i = 0; i < Nc; i++) {
|
||||||
|
for (int j = 0; j < Nc; j++) {
|
||||||
|
out(mu)()(i, j) = in(mu)()(i, j);
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class fobj, class sobj>
|
||||||
|
struct GaugeDoubleStoredUnmunger {
|
||||||
|
void operator()(sobj &in, fobj &out) {
|
||||||
|
for (int mu = 0; mu < Nds; mu++) {
|
||||||
|
for (int i = 0; i < Nc; i++) {
|
||||||
|
for (int j = 0; j < Nc; j++) {
|
||||||
|
out(mu)()(i, j) = in(mu)()(i, j);
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
template<class fobj,class sobj>
|
template<class fobj,class sobj>
|
||||||
struct Gauge3x2munger{
|
struct Gauge3x2munger{
|
||||||
void operator() (fobj &in,sobj &out){
|
void operator() (fobj &in,sobj &out){
|
||||||
|
@ -146,7 +146,7 @@ public:
|
|||||||
int ieee32big = (format == std::string("IEEE32BIG"));
|
int ieee32big = (format == std::string("IEEE32BIG"));
|
||||||
int ieee32 = (format == std::string("IEEE32"));
|
int ieee32 = (format == std::string("IEEE32"));
|
||||||
int ieee64big = (format == std::string("IEEE64BIG"));
|
int ieee64big = (format == std::string("IEEE64BIG"));
|
||||||
int ieee64 = (format == std::string("IEEE64"));
|
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
|
||||||
|
|
||||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
// depending on datatype, set up munger;
|
// depending on datatype, set up munger;
|
||||||
|
224
Grid/parallelIO/OpenQcdIO.h
Normal file
224
Grid/parallelIO/OpenQcdIO.h
Normal file
@ -0,0 +1,224 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/parallelIO/OpenQcdIO.h
|
||||||
|
|
||||||
|
Copyright (C) 2015 - 2020
|
||||||
|
|
||||||
|
Author: Daniel Richtmann <daniel.richtmann@ur.de>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
struct OpenQcdHeader : Serializable {
|
||||||
|
GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
|
||||||
|
int, Nt,
|
||||||
|
int, Nx,
|
||||||
|
int, Ny,
|
||||||
|
int, Nz,
|
||||||
|
double, plaq);
|
||||||
|
};
|
||||||
|
|
||||||
|
class OpenQcdIO : public BinaryIO {
|
||||||
|
public:
|
||||||
|
static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6
|
||||||
|
|
||||||
|
static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) {
|
||||||
|
OpenQcdHeader header;
|
||||||
|
|
||||||
|
{
|
||||||
|
std::ifstream fin(file, std::ios::in | std::ios::binary);
|
||||||
|
fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
|
||||||
|
assert(!fin.fail());
|
||||||
|
field.data_start = fin.tellg();
|
||||||
|
fin.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
header.plaq /= normalisationFactor;
|
||||||
|
|
||||||
|
// sanity check (should trigger on endian issues)
|
||||||
|
assert(0 < header.Nt && header.Nt <= 1024);
|
||||||
|
assert(0 < header.Nx && header.Nx <= 1024);
|
||||||
|
assert(0 < header.Ny && header.Ny <= 1024);
|
||||||
|
assert(0 < header.Nz && header.Nz <= 1024);
|
||||||
|
|
||||||
|
field.dimension[0] = header.Nx;
|
||||||
|
field.dimension[1] = header.Ny;
|
||||||
|
field.dimension[2] = header.Nz;
|
||||||
|
field.dimension[3] = header.Nt;
|
||||||
|
|
||||||
|
std::cout << GridLogDebug << "header: " << header << std::endl;
|
||||||
|
std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
|
||||||
|
std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
|
||||||
|
|
||||||
|
assert(grid->_ndimension == Nd);
|
||||||
|
for(int d = 0; d < Nd; d++)
|
||||||
|
assert(grid->_fdimensions[d] == field.dimension[d]);
|
||||||
|
|
||||||
|
field.plaquette = header.plaq;
|
||||||
|
|
||||||
|
return field.data_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vsimd>
|
||||||
|
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
|
||||||
|
FieldMetaData& header,
|
||||||
|
std::string file) {
|
||||||
|
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
|
||||||
|
|
||||||
|
assert(Ns == 4 and Nd == 4 and Nc == 3);
|
||||||
|
|
||||||
|
auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
|
||||||
|
assert(grid != nullptr); assert(grid->_ndimension == Nd);
|
||||||
|
|
||||||
|
uint64_t offset = readHeader(file, Umu.Grid(), header);
|
||||||
|
|
||||||
|
FieldMetaData clone(header);
|
||||||
|
|
||||||
|
std::string format("IEEE64"); // they always store little endian double precsision
|
||||||
|
uint32_t nersc_csum, scidac_csuma, scidac_csumb;
|
||||||
|
|
||||||
|
GridCartesian* grid_openqcd = createOpenQcdGrid(grid);
|
||||||
|
GridRedBlackCartesian* grid_rb = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
|
||||||
|
|
||||||
|
typedef DoubleStoredColourMatrixD fobj;
|
||||||
|
typedef typename DoubleStoredGaugeField::vector_object::scalar_object sobj;
|
||||||
|
typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;
|
||||||
|
|
||||||
|
word w = 0;
|
||||||
|
|
||||||
|
std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
|
||||||
|
std::vector<sobj> scalardata(grid->lSites());
|
||||||
|
|
||||||
|
IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
|
||||||
|
nersc_csum, scidac_csuma, scidac_csumb);
|
||||||
|
|
||||||
|
GridStopWatch timer;
|
||||||
|
timer.Start();
|
||||||
|
|
||||||
|
DoubleStoredGaugeField Umu_ds(grid);
|
||||||
|
|
||||||
|
auto munge = GaugeDoubleStoredMunger<DoubleStoredColourMatrixD, DoubleStoredColourMatrix>();
|
||||||
|
|
||||||
|
Coordinate ldim = grid->LocalDimensions();
|
||||||
|
thread_for(idx_g, grid->lSites(), {
|
||||||
|
Coordinate coor;
|
||||||
|
grid->LocalIndexToLocalCoor(idx_g, coor);
|
||||||
|
|
||||||
|
bool isOdd = grid_rb->CheckerBoard(coor) == Odd;
|
||||||
|
|
||||||
|
if(!isOdd) continue;
|
||||||
|
|
||||||
|
int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
|
||||||
|
+ coor[Xdir] * ldim[Ydir] * ldim[Zdir]
|
||||||
|
+ coor[Ydir] * ldim[Zdir]
|
||||||
|
+ coor[Zdir])/2;
|
||||||
|
|
||||||
|
munge(iodata[idx_o], scalardata[idx_g]);
|
||||||
|
});
|
||||||
|
|
||||||
|
grid->Barrier(); timer.Stop();
|
||||||
|
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;
|
||||||
|
|
||||||
|
timer.Reset(); timer.Start();
|
||||||
|
|
||||||
|
vectorizeFromLexOrdArray(scalardata, Umu_ds);
|
||||||
|
|
||||||
|
grid->Barrier(); timer.Stop();
|
||||||
|
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;
|
||||||
|
|
||||||
|
timer.Reset(); timer.Start();
|
||||||
|
|
||||||
|
undoDoubleStore(Umu, Umu_ds);
|
||||||
|
|
||||||
|
grid->Barrier(); timer.Stop();
|
||||||
|
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
|
||||||
|
|
||||||
|
GaugeStatistics(Umu, clone);
|
||||||
|
|
||||||
|
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
std::cout << GridLogMessage << "OpenQcd Configuration " << file
|
||||||
|
<< " plaquette " << clone.plaquette
|
||||||
|
<< " header " << header.plaquette
|
||||||
|
<< " difference " << plaq_diff
|
||||||
|
<< std::endl;
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
|
||||||
|
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
|
||||||
|
|
||||||
|
if(plaq_diff >= tol)
|
||||||
|
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
|
||||||
|
assert(plaq_diff < tol);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vsimd>
|
||||||
|
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
|
||||||
|
std::string file) {
|
||||||
|
std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
|
||||||
|
// exploit GridCartesian to be able to still use IOobject
|
||||||
|
Coordinate gdim = grid->GlobalDimensions();
|
||||||
|
Coordinate ldim = grid->LocalDimensions();
|
||||||
|
Coordinate pcoor = grid->ThisProcessorCoor();
|
||||||
|
|
||||||
|
// openqcd does rb on the z direction
|
||||||
|
gdim[Zdir] /= 2;
|
||||||
|
ldim[Zdir] /= 2;
|
||||||
|
|
||||||
|
// and has the order T X Y Z (from slowest to fastest)
|
||||||
|
std::swap(gdim[Xdir], gdim[Zdir]);
|
||||||
|
std::swap(ldim[Xdir], ldim[Zdir]);
|
||||||
|
std::swap(pcoor[Xdir], pcoor[Zdir]);
|
||||||
|
|
||||||
|
GridCartesian* ret = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid());
|
||||||
|
ret->_ldimensions = ldim;
|
||||||
|
ret->_processor_coor = pcoor;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vsimd>
|
||||||
|
static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
|
||||||
|
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
|
||||||
|
conformable(Umu.Grid(), Umu_ds.Grid());
|
||||||
|
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
|
||||||
|
|
||||||
|
// they store T+, T-, X+, X-, Y+, Y-, Z+, Z-
|
||||||
|
for(int mu_g = 0; mu_g < Nd; ++mu_g) {
|
||||||
|
int mu_o = (mu_g + 1) % Nd;
|
||||||
|
U = PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o)
|
||||||
|
+ Cshift(PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o + 1), mu_g, +1);
|
||||||
|
PokeIndex<LorentzIndex>(Umu, U, mu_g);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
281
Grid/parallelIO/OpenQcdIOChromaReference.h
Normal file
281
Grid/parallelIO/OpenQcdIOChromaReference.h
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h
|
||||||
|
|
||||||
|
Copyright (C) 2015 - 2020
|
||||||
|
|
||||||
|
Author: Daniel Richtmann <daniel.richtmann@ur.de>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <ios>
|
||||||
|
#include <iostream>
|
||||||
|
#include <limits>
|
||||||
|
#include <iomanip>
|
||||||
|
#include <mpi.h>
|
||||||
|
#include <ostream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
// Debug tracing helpers. Both expand to a braced block that writes one line to
// std::cerr, and both use a pointer named `grid` taken from the expansion site.
// CHECK prints file, line and rank.
#define CHECK                                                                                  \
  {                                                                                            \
    std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl; \
  }
// CHECK_VAR(a) prints file, line, rank, enclosing function, and the text and value of `a`.
#define CHECK_VAR(a)                                                                 \
  {                                                                                  \
    std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": "  \
              << __func__ << " " << #a << "=" << (a) << std::endl;                   \
  }
|
||||||
|
// #undef CHECK
|
||||||
|
// #define CHECK
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
class ParRdr {
|
||||||
|
private:
|
||||||
|
bool const swap;
|
||||||
|
|
||||||
|
MPI_Status status;
|
||||||
|
MPI_File fp;
|
||||||
|
|
||||||
|
int err;
|
||||||
|
|
||||||
|
MPI_Datatype oddSiteType;
|
||||||
|
MPI_Datatype fileViewType;
|
||||||
|
|
||||||
|
GridBase* grid;
|
||||||
|
|
||||||
|
public:
|
||||||
|
ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr)
|
||||||
|
: swap(false)
|
||||||
|
, grid(gridPtr) {
|
||||||
|
err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
|
||||||
|
assert(err == MPI_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual ~ParRdr() { MPI_File_close(&fp); }
|
||||||
|
|
||||||
|
inline void errInfo(int const err, std::string const& func) {
|
||||||
|
static char estring[MPI_MAX_ERROR_STRING];
|
||||||
|
int eclass = -1, len = 0;
|
||||||
|
MPI_Error_class(err, &eclass);
|
||||||
|
MPI_Error_string(err, estring, &len);
|
||||||
|
std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
int readHeader(FieldMetaData& field) {
|
||||||
|
assert((grid->_ndimension == Nd) && (Nd == 4));
|
||||||
|
assert(Nc == 3);
|
||||||
|
|
||||||
|
OpenQcdHeader header;
|
||||||
|
|
||||||
|
readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR);
|
||||||
|
|
||||||
|
header.plaq /= 3.; // TODO change this into normalizationfactor
|
||||||
|
|
||||||
|
// sanity check (should trigger on endian issues) TODO remove?
|
||||||
|
assert(0 < header.Nt && header.Nt <= 1024);
|
||||||
|
assert(0 < header.Nx && header.Nx <= 1024);
|
||||||
|
assert(0 < header.Ny && header.Ny <= 1024);
|
||||||
|
assert(0 < header.Nz && header.Nz <= 1024);
|
||||||
|
|
||||||
|
field.dimension[0] = header.Nx;
|
||||||
|
field.dimension[1] = header.Ny;
|
||||||
|
field.dimension[2] = header.Nz;
|
||||||
|
field.dimension[3] = header.Nt;
|
||||||
|
|
||||||
|
for(int d = 0; d < Nd; d++)
|
||||||
|
assert(grid->FullDimensions()[d] == field.dimension[d]);
|
||||||
|
|
||||||
|
field.plaquette = header.plaq;
|
||||||
|
|
||||||
|
field.data_start = sizeof(OpenQcdHeader);
|
||||||
|
|
||||||
|
return field.data_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
|
||||||
|
err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
|
||||||
|
errInfo(err, "MPI_File_read_at_all");
|
||||||
|
// CHECK_VAR(err)
|
||||||
|
|
||||||
|
int read = -1;
|
||||||
|
MPI_Get_count(&status, datatype, &read);
|
||||||
|
// CHECK_VAR(read)
|
||||||
|
assert(nbytes == (uint64_t)read);
|
||||||
|
assert(err == MPI_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
void createTypes() {
|
||||||
|
constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
|
||||||
|
|
||||||
|
err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
|
||||||
|
err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
|
||||||
|
|
||||||
|
Coordinate const L = grid->GlobalDimensions();
|
||||||
|
Coordinate const l = grid->LocalDimensions();
|
||||||
|
Coordinate const i = grid->ThisProcessorCoor();
|
||||||
|
|
||||||
|
Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
|
||||||
|
Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
|
||||||
|
Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
|
||||||
|
|
||||||
|
err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
|
||||||
|
err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
void freeTypes() {
|
||||||
|
err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
|
||||||
|
err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
|
||||||
|
auto hdr_offset = readHeader(meta);
|
||||||
|
CHECK
|
||||||
|
createTypes();
|
||||||
|
err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
|
||||||
|
CHECK
|
||||||
|
int const domainSites = grid->lSites();
|
||||||
|
domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
|
||||||
|
|
||||||
|
// the actual READ
|
||||||
|
constexpr uint64_t cm_size = 2 * Nc * Nc * sizeof(double); // 2_complex
|
||||||
|
constexpr uint64_t os_size = Nd * 2 * cm_size; // 2_fwdbwd
|
||||||
|
constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int adressable elems: floor is fine
|
||||||
|
uint64_t const n_os = domainSites / 2;
|
||||||
|
|
||||||
|
for(uint64_t os_idx = 0; os_idx < n_os;) {
|
||||||
|
uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
|
||||||
|
uint64_t const cm = os_idx * Nd * 2;
|
||||||
|
readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
|
||||||
|
os_idx += read_os;
|
||||||
|
}
|
||||||
|
|
||||||
|
CHECK
|
||||||
|
err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
|
||||||
|
errInfo(err, "MPI_File_set_view1");
|
||||||
|
assert(err == MPI_SUCCESS);
|
||||||
|
freeTypes();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class OpenQcdIOChromaReference : public BinaryIO {
|
||||||
|
public:
|
||||||
|
template<class vsimd>
|
||||||
|
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
|
||||||
|
Grid::FieldMetaData& header,
|
||||||
|
std::string file) {
|
||||||
|
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
|
||||||
|
|
||||||
|
assert(Ns == 4 and Nd == 4 and Nc == 3);
|
||||||
|
|
||||||
|
auto grid = Umu.Grid();
|
||||||
|
|
||||||
|
typedef ColourMatrixD fobj;
|
||||||
|
|
||||||
|
std::vector<fobj> iodata(
|
||||||
|
Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
|
||||||
|
|
||||||
|
{
|
||||||
|
ParRdr rdr(MPI_COMM_WORLD, file, grid);
|
||||||
|
rdr.readGauge(iodata, header);
|
||||||
|
} // equivalent to using binaryio
|
||||||
|
|
||||||
|
std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
|
||||||
|
|
||||||
|
copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
|
||||||
|
|
||||||
|
DoubledGaugeField Umu_ds(grid);
|
||||||
|
|
||||||
|
vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
|
||||||
|
|
||||||
|
redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
|
||||||
|
|
||||||
|
FieldMetaData clone(header);
|
||||||
|
|
||||||
|
GaugeStatistics(Umu, clone);
|
||||||
|
|
||||||
|
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
std::cout << GridLogMessage << "OpenQcd Configuration " << file
|
||||||
|
<< " plaquette " << clone.plaquette
|
||||||
|
<< " header " << header.plaquette
|
||||||
|
<< " difference " << plaq_diff
|
||||||
|
<< std::endl;
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
|
||||||
|
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
|
||||||
|
|
||||||
|
if(plaq_diff >= tol)
|
||||||
|
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
|
||||||
|
assert(plaq_diff < tol);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
template<class vsimd>
|
||||||
|
static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
|
||||||
|
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
|
||||||
|
Grid::conformable(Umu.Grid(), Umu_ds.Grid());
|
||||||
|
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
|
||||||
|
|
||||||
|
U = PeekIndex<LorentzIndex>(Umu_ds, 2) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 3), 0, +1); PokeIndex<LorentzIndex>(Umu, U, 0);
|
||||||
|
U = PeekIndex<LorentzIndex>(Umu_ds, 4) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 5), 1, +1); PokeIndex<LorentzIndex>(Umu, U, 1);
|
||||||
|
U = PeekIndex<LorentzIndex>(Umu_ds, 6) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 7), 2, +1); PokeIndex<LorentzIndex>(Umu, U, 2);
|
||||||
|
U = PeekIndex<LorentzIndex>(Umu_ds, 0) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 1), 3, +1); PokeIndex<LorentzIndex>(Umu, U, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
|
||||||
|
std::vector<ColourMatrixD> const& node_buff,
|
||||||
|
GridBase* grid) {
|
||||||
|
assert(node_buff.size() == Nd * grid->lSites());
|
||||||
|
|
||||||
|
Coordinate const& l = grid->LocalDimensions();
|
||||||
|
|
||||||
|
Coordinate coord(Nd);
|
||||||
|
int& x = coord[0];
|
||||||
|
int& y = coord[1];
|
||||||
|
int& z = coord[2];
|
||||||
|
int& t = coord[3];
|
||||||
|
|
||||||
|
int buff_idx = 0;
|
||||||
|
for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering
|
||||||
|
for(x = 0; x < l[0]; ++x)
|
||||||
|
for(y = 0; y < l[1]; ++y)
|
||||||
|
for(z = 0; z < l[2]; ++z) {
|
||||||
|
if((t + z + y + x) % 2 == 0) continue;
|
||||||
|
|
||||||
|
int local_idx;
|
||||||
|
Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());
|
||||||
|
for(int mu = 0; mu < 2 * Nd; ++mu)
|
||||||
|
for(int c1 = 0; c1 < Nc; ++c1) {
|
||||||
|
for(int c2 = 0; c2 < Nc; ++c2) {
|
||||||
|
u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
buff_idx += 2 * Nd;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(node_buff.size() == buff_idx);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef __x86_64__
|
#ifdef __x86_64__
|
||||||
#ifdef GRID_NVCC
|
#ifdef GRID_CUDA
|
||||||
accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
accelerator_inline uint64_t __rdtsc(void) { return 0; }
|
||||||
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
|
accelerator_inline uint64_t __rdpmc(int ) { return 0; }
|
||||||
#else
|
#else
|
||||||
@ -95,7 +95,8 @@ inline uint64_t cyclecount(void){
|
|||||||
}
|
}
|
||||||
#elif defined __x86_64__
|
#elif defined __x86_64__
|
||||||
inline uint64_t cyclecount(void){
|
inline uint64_t cyclecount(void){
|
||||||
return __rdtsc();
|
uint64_t ret = __rdtsc();
|
||||||
|
return (uint64_t)ret;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@ -111,7 +112,6 @@ class PerformanceCounter {
|
|||||||
private:
|
private:
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
public:
|
|
||||||
uint32_t type;
|
uint32_t type;
|
||||||
uint64_t config;
|
uint64_t config;
|
||||||
const char *name;
|
const char *name;
|
||||||
|
@ -110,15 +110,15 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
|
accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
|
||||||
}
|
}
|
||||||
GridTime Elapsed(void) {
|
GridTime Elapsed(void) const {
|
||||||
assert(running == false);
|
assert(running == false);
|
||||||
return std::chrono::duration_cast<GridTime>( accumulator );
|
return std::chrono::duration_cast<GridTime>( accumulator );
|
||||||
}
|
}
|
||||||
uint64_t useconds(void){
|
uint64_t useconds(void) const {
|
||||||
assert(running == false);
|
assert(running == false);
|
||||||
return (uint64_t) accumulator.count();
|
return (uint64_t) accumulator.count();
|
||||||
}
|
}
|
||||||
bool isRunning(void){
|
bool isRunning(void) const {
|
||||||
return running;
|
return running;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -12773,7 +12773,7 @@ namespace pugi
|
|||||||
#undef PUGI__THROW_ERROR
|
#undef PUGI__THROW_ERROR
|
||||||
#undef PUGI__CHECK_ERROR
|
#undef PUGI__CHECK_ERROR
|
||||||
|
|
||||||
#ifdef GRID_NVCC
|
#ifdef GRID_CUDA
|
||||||
#pragma pop
|
#pragma pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -40,8 +40,8 @@ public:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
// override multiply
|
// override multiply
|
||||||
virtual RealD M (const FermionField &in, FermionField &out);
|
virtual void M (const FermionField &in, FermionField &out);
|
||||||
virtual RealD Mdag (const FermionField &in, FermionField &out);
|
virtual void Mdag (const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||||
@ -141,7 +141,33 @@ public:
|
|||||||
Vector<iSinglet<Simd> > MatpInvDag;
|
Vector<iSinglet<Simd> > MatpInvDag;
|
||||||
Vector<iSinglet<Simd> > MatmInvDag;
|
Vector<iSinglet<Simd> > MatmInvDag;
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
// Conserved current utilities
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Virtual can't template
|
||||||
|
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
|
PropagatorField &q_in_2,
|
||||||
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &phys_src,
|
||||||
|
Current curr_type,
|
||||||
|
unsigned int mu);
|
||||||
|
|
||||||
|
void SeqConservedCurrent(PropagatorField &q_in,
|
||||||
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &phys_src,
|
||||||
|
Current curr_type,
|
||||||
|
unsigned int mu,
|
||||||
|
unsigned int tmin,
|
||||||
|
unsigned int tmax,
|
||||||
|
ComplexField &lattice_cmplx);
|
||||||
|
|
||||||
|
void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
|
||||||
|
void ContractJ5q(FermionField &q_in,ComplexField &J5q);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
// Constructors
|
// Constructors
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
CayleyFermion5D(GaugeField &_Umu,
|
CayleyFermion5D(GaugeField &_Umu,
|
||||||
GridCartesian &FiveDimGrid,
|
GridCartesian &FiveDimGrid,
|
||||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
@ -41,8 +41,8 @@ public:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
// override multiply
|
// override multiply
|
||||||
virtual RealD M (const FermionField &in, FermionField &out);
|
virtual void M (const FermionField &in, FermionField &out);
|
||||||
virtual RealD Mdag (const FermionField &in, FermionField &out);
|
virtual void Mdag (const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||||
|
@ -53,8 +53,8 @@ public:
|
|||||||
virtual void DtildeInv (const FermionField& in, FermionField& out);
|
virtual void DtildeInv (const FermionField& in, FermionField& out);
|
||||||
|
|
||||||
// override multiply
|
// override multiply
|
||||||
virtual RealD M (const FermionField& in, FermionField& out);
|
virtual void M (const FermionField& in, FermionField& out);
|
||||||
virtual RealD Mdag (const FermionField& in, FermionField& out);
|
virtual void Mdag (const FermionField& in, FermionField& out);
|
||||||
|
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
virtual void Mooee (const FermionField& in, FermionField& out);
|
virtual void Mooee (const FermionField& in, FermionField& out);
|
||||||
|
@ -115,18 +115,21 @@ public:
|
|||||||
PokeIndex<LorentzIndex>(Uadj, U, mu);
|
PokeIndex<LorentzIndex>(Uadj, U, mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
autoView(Umu_v,Umu,CpuRead);
|
||||||
|
autoView(Uadj_v,Uadj,CpuRead);
|
||||||
|
autoView(Uds_v,Uds,CpuWrite);
|
||||||
|
thread_for( lidx, GaugeGrid->lSites(), {
|
||||||
Coordinate lcoor;
|
Coordinate lcoor;
|
||||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||||
|
|
||||||
peekLocalSite(ScalarUmu, Umu, lcoor);
|
peekLocalSite(ScalarUmu, Umu_v, lcoor);
|
||||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
||||||
|
|
||||||
peekLocalSite(ScalarUmu, Uadj, lcoor);
|
peekLocalSite(ScalarUmu, Uadj_v, lcoor);
|
||||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
|
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
|
||||||
|
|
||||||
pokeLocalSite(ScalarUds, Uds, lcoor);
|
pokeLocalSite(ScalarUds, Uds_v, lcoor);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
|
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
|
||||||
|
@ -57,6 +57,7 @@ NAMESPACE_CHECK(WilsonClover);
|
|||||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
||||||
NAMESPACE_CHECK(Wilson5D);
|
NAMESPACE_CHECK(Wilson5D);
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
|
||||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
|
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
|
||||||
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
|
||||||
NAMESPACE_CHECK(Staggered);
|
NAMESPACE_CHECK(Staggered);
|
||||||
@ -282,11 +283,15 @@ typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
|
|||||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
|
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
|
||||||
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
|
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
|
||||||
|
|
||||||
|
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
|
||||||
|
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
|
||||||
|
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
|
||||||
|
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
|
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
|
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
|
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
|
||||||
|
|
||||||
#ifndef GRID_NVCC
|
#ifndef GRID_CUDA
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
|
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
|
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
|
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
|
||||||
|
@ -58,8 +58,8 @@ public:
|
|||||||
virtual GridBase *GaugeRedBlackGrid(void) =0;
|
virtual GridBase *GaugeRedBlackGrid(void) =0;
|
||||||
|
|
||||||
// override multiply
|
// override multiply
|
||||||
virtual RealD M (const FermionField &in, FermionField &out)=0;
|
virtual void M (const FermionField &in, FermionField &out)=0;
|
||||||
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;
|
virtual void Mdag (const FermionField &in, FermionField &out)=0;
|
||||||
|
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
virtual void Meooe (const FermionField &in, FermionField &out)=0;
|
virtual void Meooe (const FermionField &in, FermionField &out)=0;
|
||||||
@ -86,7 +86,6 @@ public:
|
|||||||
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
|
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
|
||||||
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
|
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag)=0;
|
||||||
|
|
||||||
|
|
||||||
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
|
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
|
||||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||||
@ -148,15 +147,19 @@ public:
|
|||||||
virtual void ContractConservedCurrent(PropagatorField &q_in_1,
|
virtual void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
PropagatorField &q_in_2,
|
PropagatorField &q_in_2,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &phys_src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu)=0;
|
unsigned int mu)
|
||||||
|
{assert(0);};
|
||||||
virtual void SeqConservedCurrent(PropagatorField &q_in,
|
virtual void SeqConservedCurrent(PropagatorField &q_in,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &phys_src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
unsigned int tmax,
|
unsigned int tmax,
|
||||||
ComplexField &lattice_cmplx)=0;
|
ComplexField &lattice_cmplx)
|
||||||
|
{assert(0);};
|
||||||
|
|
||||||
// Only reimplemented in Wilson5D
|
// Only reimplemented in Wilson5D
|
||||||
// Default to just a zero correlation function
|
// Default to just a zero correlation function
|
||||||
|
@ -38,6 +38,7 @@ public:
|
|||||||
static const bool isFundamental = Representation::isFundamental;
|
static const bool isFundamental = Representation::isFundamental;
|
||||||
static const int Nhcs = Options::Nhcs;
|
static const int Nhcs = Options::Nhcs;
|
||||||
static const bool LsVectorised=false;
|
static const bool LsVectorised=false;
|
||||||
|
static const bool isGparity=true;
|
||||||
|
|
||||||
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
|
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
INHERIT_GIMPL_TYPES(Gimpl);
|
||||||
@ -46,7 +47,7 @@ public:
|
|||||||
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
|
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
|
||||||
|
|
||||||
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Dimension>, Ns>, Ngp>;
|
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Dimension>, Ns>, Ngp>;
|
||||||
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>, Ngp>;
|
template <typename vtype> using iImplPropagator = iMatrix<iMatrix<iMatrix<vtype, Dimension>, Ns>, Ngp>;
|
||||||
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhs>, Ngp>;
|
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhs>, Ngp>;
|
||||||
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
|
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
|
||||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
|
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
|
||||||
@ -80,6 +81,7 @@ public:
|
|||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class _Spinor>
|
template<class _Spinor>
|
||||||
static accelerator_inline void multLink(_Spinor &phi,
|
static accelerator_inline void multLink(_Spinor &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
@ -94,11 +96,11 @@ public:
|
|||||||
int sl = St._simd_layout[direction];
|
int sl = St._simd_layout[direction];
|
||||||
Coordinate icoor;
|
Coordinate icoor;
|
||||||
|
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef GRID_SIMT
|
||||||
_Spinor tmp;
|
_Spinor tmp;
|
||||||
|
|
||||||
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
||||||
int s = SIMTlane(Nsimd);
|
int s = acceleratorSIMTlane(Nsimd);
|
||||||
St.iCoorFromIindex(icoor,s);
|
St.iCoorFromIindex(icoor,s);
|
||||||
|
|
||||||
int mmu = mu % Nd;
|
int mmu = mu % Nd;
|
||||||
@ -191,6 +193,16 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<class _SpinorField>
|
||||||
|
inline void multLinkField(_SpinorField & out,
|
||||||
|
const DoubledGaugeField &Umu,
|
||||||
|
const _SpinorField & phi,
|
||||||
|
int mu)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
template <class ref>
|
template <class ref>
|
||||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||||
{
|
{
|
||||||
@ -221,14 +233,16 @@ public:
|
|||||||
Uconj = where(coor==neglink,-Uconj,Uconj);
|
Uconj = where(coor==neglink,-Uconj,Uconj);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto U_v = U.View();
|
{
|
||||||
auto Uds_v = Uds.View();
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uconj_v = Uconj.View();
|
autoView( Uconj_v , Uconj, CpuRead);
|
||||||
auto Utmp_v= Utmp.View();
|
autoView( Uds_v , Uds, CpuWrite);
|
||||||
|
autoView( Utmp_v, Utmp, CpuWrite);
|
||||||
thread_foreach(ss,U_v,{
|
thread_foreach(ss,U_v,{
|
||||||
Uds_v[ss](0)(mu) = U_v[ss]();
|
Uds_v[ss](0)(mu) = U_v[ss]();
|
||||||
Uds_v[ss](1)(mu) = Uconj_v[ss]();
|
Uds_v[ss](1)(mu) = Uconj_v[ss]();
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
|
||||||
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
||||||
Uconj = adj(Cshift(Uconj,mu,-1));
|
Uconj = adj(Cshift(Uconj,mu,-1));
|
||||||
@ -238,19 +252,25 @@ public:
|
|||||||
Utmp = where(coor==0,Uconj,Utmp);
|
Utmp = where(coor==0,Uconj,Utmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
autoView( Uds_v , Uds, CpuWrite);
|
||||||
|
autoView( Utmp_v, Utmp, CpuWrite);
|
||||||
thread_foreach(ss,Utmp_v,{
|
thread_foreach(ss,Utmp_v,{
|
||||||
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
||||||
});
|
});
|
||||||
|
}
|
||||||
Utmp = Uconj;
|
Utmp = Uconj;
|
||||||
if ( Params.twists[mu] ) {
|
if ( Params.twists[mu] ) {
|
||||||
Utmp = where(coor==0,U,Utmp);
|
Utmp = where(coor==0,U,Utmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
autoView( Uds_v , Uds, CpuWrite);
|
||||||
|
autoView( Utmp_v, Utmp, CpuWrite);
|
||||||
thread_foreach(ss,Utmp_v,{
|
thread_foreach(ss,Utmp_v,{
|
||||||
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
||||||
});
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -260,11 +280,14 @@ public:
|
|||||||
GaugeLinkField link(mat.Grid());
|
GaugeLinkField link(mat.Grid());
|
||||||
// use lorentz for flavour as hack.
|
// use lorentz for flavour as hack.
|
||||||
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
||||||
auto link_v = link.View();
|
|
||||||
auto tmp_v = tmp.View();
|
{
|
||||||
|
autoView( link_v , link, CpuWrite);
|
||||||
|
autoView( tmp_v , tmp, CpuRead);
|
||||||
thread_foreach(ss,tmp_v,{
|
thread_foreach(ss,tmp_v,{
|
||||||
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
|
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
|
||||||
});
|
});
|
||||||
|
}
|
||||||
PokeIndex<LorentzIndex>(mat, link, mu);
|
PokeIndex<LorentzIndex>(mat, link, mu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -294,9 +317,10 @@ public:
|
|||||||
|
|
||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
auto tmp_v = tmp.View();
|
{
|
||||||
auto Atilde_v = Atilde.View();
|
autoView( tmp_v , tmp, CpuWrite);
|
||||||
auto Btilde_v = Btilde.View();
|
autoView( Atilde_v , Atilde, CpuRead);
|
||||||
|
autoView( Btilde_v , Btilde, CpuRead);
|
||||||
thread_for(ss,tmp.Grid()->oSites(),{
|
thread_for(ss,tmp.Grid()->oSites(),{
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
int sF = s + Ls * ss;
|
int sF = s + Ls * ss;
|
||||||
@ -304,6 +328,7 @@ public:
|
|||||||
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
}
|
||||||
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -71,8 +71,8 @@ public:
|
|||||||
// override multiply; cut number routines if pass dagger argument
|
// override multiply; cut number routines if pass dagger argument
|
||||||
// and also make interface more uniformly consistent
|
// and also make interface more uniformly consistent
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
RealD M(const FermionField &in, FermionField &out);
|
void M(const FermionField &in, FermionField &out);
|
||||||
RealD Mdag(const FermionField &in, FermionField &out);
|
void Mdag(const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
@ -185,10 +185,12 @@ public:
|
|||||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
PropagatorField &q_in_2,
|
PropagatorField &q_in_2,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu);
|
unsigned int mu);
|
||||||
void SeqConservedCurrent(PropagatorField &q_in,
|
void SeqConservedCurrent(PropagatorField &q_in,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &srct,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -74,8 +73,8 @@ public:
|
|||||||
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
|
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
|
||||||
|
|
||||||
// full checkerboard operations; leave unimplemented as abstract for now
|
// full checkerboard operations; leave unimplemented as abstract for now
|
||||||
RealD M (const FermionField &in, FermionField &out);
|
void M (const FermionField &in, FermionField &out);
|
||||||
RealD Mdag (const FermionField &in, FermionField &out);
|
void Mdag (const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
void Meooe (const FermionField &in, FermionField &out);
|
void Meooe (const FermionField &in, FermionField &out);
|
||||||
@ -217,10 +216,12 @@ public:
|
|||||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
PropagatorField &q_in_2,
|
PropagatorField &q_in_2,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu);
|
unsigned int mu);
|
||||||
void SeqConservedCurrent(PropagatorField &q_in,
|
void SeqConservedCurrent(PropagatorField &q_in,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
|
@ -56,8 +56,8 @@ public:
|
|||||||
virtual void DtildeInv (const FermionField& in, FermionField& out);
|
virtual void DtildeInv (const FermionField& in, FermionField& out);
|
||||||
|
|
||||||
// override multiply
|
// override multiply
|
||||||
virtual RealD M (const FermionField& in, FermionField& out);
|
virtual void M (const FermionField& in, FermionField& out);
|
||||||
virtual RealD Mdag (const FermionField& in, FermionField& out);
|
virtual void Mdag (const FermionField& in, FermionField& out);
|
||||||
|
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
virtual void Mooee (const FermionField& in, FermionField& out);
|
virtual void Mooee (const FermionField& in, FermionField& out);
|
||||||
|
@ -59,7 +59,7 @@ public:
|
|||||||
{
|
{
|
||||||
RealD eps = 1.0;
|
RealD eps = 1.0;
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
|
// std::cout<<GridLogMessage << "MobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" Tanh approx"<<std::endl;
|
||||||
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
|
||||||
assert(zdata->n==this->Ls);
|
assert(zdata->n==this->Ls);
|
||||||
|
|
||||||
|
194
Grid/qcd/action/fermion/NaiveStaggeredFermion.h
Normal file
194
Grid/qcd/action/fermion/NaiveStaggeredFermion.h
Normal file
@ -0,0 +1,194 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/NaiveStaggeredFermion.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi, Peter Boyle
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
|
||||||
|
#define GRID_QCD_NAIVE_STAG_FERMION_H
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
class NaiveStaggeredFermionStatic {
|
||||||
|
public:
|
||||||
|
static const std::vector<int> directions;
|
||||||
|
static const std::vector<int> displacements;
|
||||||
|
static const int npoint = 8;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
|
||||||
|
public:
|
||||||
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
|
typedef StaggeredKernels<Impl> Kernels;
|
||||||
|
|
||||||
|
FermionField _tmp;
|
||||||
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// Performance monitoring
|
||||||
|
////////////////////////////////////////
|
||||||
|
void Report(void);
|
||||||
|
void ZeroCounters(void);
|
||||||
|
double DhopTotalTime;
|
||||||
|
double DhopCalls;
|
||||||
|
double DhopCommTime;
|
||||||
|
double DhopComputeTime;
|
||||||
|
double DhopComputeTime2;
|
||||||
|
double DhopFaceTime;
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
// Implement the abstract base
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
GridBase *GaugeGrid(void) { return _grid; }
|
||||||
|
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
|
||||||
|
GridBase *FermionGrid(void) { return _grid; }
|
||||||
|
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
// override multiply; cut number routines if pass dagger argument
|
||||||
|
// and also make interface more uniformly consistent
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
void M(const FermionField &in, FermionField &out);
|
||||||
|
void Mdag(const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
// half checkerboard operations
|
||||||
|
/////////////////////////////////////////////////////////
|
||||||
|
void Meooe(const FermionField &in, FermionField &out);
|
||||||
|
void MeooeDag(const FermionField &in, FermionField &out);
|
||||||
|
void Mooee(const FermionField &in, FermionField &out);
|
||||||
|
void MooeeDag(const FermionField &in, FermionField &out);
|
||||||
|
void MooeeInv(const FermionField &in, FermionField &out);
|
||||||
|
void MooeeInvDag(const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
////////////////////////
|
||||||
|
// Derivative interface
|
||||||
|
////////////////////////
|
||||||
|
// Interface calls an internal routine
|
||||||
|
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||||
|
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||||
|
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
// non-hermitian hopping term; half cb or both
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
void Dhop (const FermionField &in, FermionField &out, int dag);
|
||||||
|
void DhopOE(const FermionField &in, FermionField &out, int dag);
|
||||||
|
void DhopEO(const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
// Multigrid assistance; force term uses too
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
|
||||||
|
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
|
||||||
|
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
// Extra methods added by derived
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
void DerivInternal(StencilImpl &st,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
GaugeField &mat,
|
||||||
|
const FermionField &A, const FermionField &B, int dag);
|
||||||
|
|
||||||
|
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// Grid own interface Constructor
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
|
||||||
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
|
RealD _c1, RealD _u0,
|
||||||
|
const ImplParams &p = ImplParams());
|
||||||
|
NaiveStaggeredFermion(GridCartesian &Fgrid,
|
||||||
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
|
RealD _c1, RealD _u0,
|
||||||
|
const ImplParams &p = ImplParams());
|
||||||
|
|
||||||
|
// DoubleStore impl dependent
|
||||||
|
void ImportGauge (const GaugeField &_U );
|
||||||
|
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
||||||
|
void CopyGaugeCheckerboards(void);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
// Data members required to support the functionality
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// protected:
|
||||||
|
public:
|
||||||
|
// any other parameters of action ???
|
||||||
|
virtual int isTrivialEE(void) { return 1; };
|
||||||
|
virtual RealD Mass(void) { return mass; }
|
||||||
|
RealD mass;
|
||||||
|
RealD u0;
|
||||||
|
RealD c1;
|
||||||
|
|
||||||
|
GridBase *_grid;
|
||||||
|
GridBase *_cbgrid;
|
||||||
|
|
||||||
|
// Defines the stencils for even and odd
|
||||||
|
StencilImpl Stencil;
|
||||||
|
StencilImpl StencilEven;
|
||||||
|
StencilImpl StencilOdd;
|
||||||
|
|
||||||
|
// Copy of the gauge field , with even and odd subsets
|
||||||
|
DoubledGaugeField Umu;
|
||||||
|
DoubledGaugeField UmuEven;
|
||||||
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
|
LebesgueOrder Lebesgue;
|
||||||
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
// Conserved current utilities
|
||||||
|
///////////////////////////////////////////////////////////////
|
||||||
|
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
|
PropagatorField &q_in_2,
|
||||||
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
|
Current curr_type,
|
||||||
|
unsigned int mu);
|
||||||
|
void SeqConservedCurrent(PropagatorField &q_in,
|
||||||
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &srct,
|
||||||
|
Current curr_type,
|
||||||
|
unsigned int mu,
|
||||||
|
unsigned int tmin,
|
||||||
|
unsigned int tmax,
|
||||||
|
ComplexField &lattice_cmplx);
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
|
||||||
|
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
#endif
|
@ -47,8 +47,8 @@ public:
|
|||||||
void M_internal(const FermionField &in, FermionField &out,int dag);
|
void M_internal(const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
// override multiply
|
// override multiply
|
||||||
virtual RealD M (const FermionField &in, FermionField &out);
|
virtual void M (const FermionField &in, FermionField &out);
|
||||||
virtual RealD Mdag (const FermionField &in, FermionField &out);
|
virtual void Mdag (const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||||
|
@ -49,21 +49,32 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
|
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
|
|
||||||
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp);
|
||||||
|
protected:
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Generic Nc kernels
|
// Generic Nc kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
template<int Naik> accelerator_inline
|
||||||
|
void DhopSiteGeneric(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
template<int Naik> accelerator_inline
|
||||||
|
void DhopSiteGenericInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
template<int Naik> accelerator_inline
|
||||||
|
void DhopSiteGenericExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
@ -71,15 +82,18 @@ public:
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Nc=3 specific kernels
|
// Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
template<int Naik> accelerator_inline
|
||||||
|
void DhopSiteHand(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
template<int Naik> accelerator_inline
|
||||||
|
void DhopSiteHandInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
template<int Naik> accelerator_inline
|
||||||
|
void DhopSiteHandExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
@ -87,27 +101,10 @@ public:
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Asm Nc=3 specific kernels
|
// Asm Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
void DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Generic interface; fan out to right routine
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
|
||||||
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
|
|
||||||
|
|
||||||
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
|
||||||
const FermionFieldView &in, FermionFieldView &out, int interior=1,int exterior=1);
|
|
||||||
|
|
||||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag, int interior,int exterior);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -113,20 +113,7 @@ public:
|
|||||||
|
|
||||||
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
|
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
|
||||||
{
|
{
|
||||||
GridBase *GaugeGrid = U_ds.Grid();
|
assert(0);
|
||||||
thread_for(lidx, GaugeGrid->lSites(),{
|
|
||||||
|
|
||||||
SiteScalarGaugeLink ScalarU;
|
|
||||||
SiteDoubledGaugeField ScalarUds;
|
|
||||||
|
|
||||||
Coordinate lcoor;
|
|
||||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
|
||||||
peekLocalSite(ScalarUds, U_ds, lcoor);
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, U, lcoor);
|
|
||||||
ScalarUds(mu) = ScalarU();
|
|
||||||
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
inline void DoubleStore(GridBase *GaugeGrid,
|
inline void DoubleStore(GridBase *GaugeGrid,
|
||||||
DoubledGaugeField &UUUds, // for Naik term
|
DoubledGaugeField &UUUds, // for Naik term
|
||||||
|
@ -109,9 +109,8 @@ public:
|
|||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual RealD M(const FermionField &in, FermionField &out);
|
virtual void M(const FermionField &in, FermionField &out);
|
||||||
virtual RealD Mdag(const FermionField &in, FermionField &out);
|
virtual void Mdag(const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
virtual void Mooee(const FermionField &in, FermionField &out);
|
virtual void Mooee(const FermionField &in, FermionField &out);
|
||||||
virtual void MooeeDag(const FermionField &in, FermionField &out);
|
virtual void MooeeDag(const FermionField &in, FermionField &out);
|
||||||
virtual void MooeeInv(const FermionField &in, FermionField &out);
|
virtual void MooeeInv(const FermionField &in, FermionField &out);
|
||||||
@ -258,15 +257,16 @@ private:
|
|||||||
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
|
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
|
||||||
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
|
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
|
||||||
|
|
||||||
|
public:
|
||||||
// eventually these can be compressed into 6x6 blocks instead of the 12x12
|
// eventually these can be compressed into 6x6 blocks instead of the 12x12
|
||||||
// using the DeGrand-Rossi basis for the gamma matrices
|
// using the DeGrand-Rossi basis for the gamma matrices
|
||||||
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
|
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
|
||||||
{
|
{
|
||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
auto T_v = T.View();
|
autoView(T_v,T,AcceleratorWrite);
|
||||||
auto F_v = F.View();
|
autoView(F_v,F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
||||||
@ -282,9 +282,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View();
|
autoView(T_v, T,AcceleratorWrite);
|
||||||
auto F_v = F.View();
|
autoView(F_v, F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -F_v[i]()();
|
T_v[i]()(0, 1) = -F_v[i]()();
|
||||||
T_v[i]()(1, 0) = F_v[i]()();
|
T_v[i]()(1, 0) = F_v[i]()();
|
||||||
@ -300,9 +300,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View();
|
autoView(T_v,T,AcceleratorWrite);
|
||||||
auto F_v = F.View();
|
autoView(F_v,F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
||||||
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
||||||
@ -318,9 +318,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View();
|
autoView( T_v , T, AcceleratorWrite);
|
||||||
auto F_v = F.View();
|
autoView( F_v , F, AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
||||||
@ -336,9 +336,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View();
|
autoView( T_v ,T,AcceleratorWrite);
|
||||||
auto F_v = F.View();
|
autoView( F_v ,F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -(F_v[i]()());
|
T_v[i]()(0, 1) = -(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = (F_v[i]()());
|
T_v[i]()(1, 0) = (F_v[i]()());
|
||||||
@ -355,9 +355,9 @@ private:
|
|||||||
|
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View();
|
autoView( T_v , T,AcceleratorWrite);
|
||||||
auto F_v = F.View();
|
autoView( F_v , F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
||||||
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
||||||
|
@ -92,8 +92,8 @@ public:
|
|||||||
// override multiply; cut number routines if pass dagger argument
|
// override multiply; cut number routines if pass dagger argument
|
||||||
// and also make interface more uniformly consistent
|
// and also make interface more uniformly consistent
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
virtual RealD M(const FermionField &in, FermionField &out);
|
virtual void M(const FermionField &in, FermionField &out);
|
||||||
virtual RealD Mdag(const FermionField &in, FermionField &out);
|
virtual void Mdag(const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
@ -193,10 +193,12 @@ public:
|
|||||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
void ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
PropagatorField &q_in_2,
|
PropagatorField &q_in_2,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &phys_src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu);
|
unsigned int mu);
|
||||||
void SeqConservedCurrent(PropagatorField &q_in,
|
void SeqConservedCurrent(PropagatorField &q_in,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &phys_src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -99,8 +98,8 @@ public:
|
|||||||
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
|
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
|
||||||
|
|
||||||
// full checkerboard operations; leave unimplemented as abstract for now
|
// full checkerboard operations; leave unimplemented as abstract for now
|
||||||
virtual RealD M (const FermionField &in, FermionField &out){assert(0); return 0.0;};
|
virtual void M (const FermionField &in, FermionField &out){assert(0);};
|
||||||
virtual RealD Mdag (const FermionField &in, FermionField &out){assert(0); return 0.0;};
|
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
|
||||||
|
|
||||||
// half checkerboard operations; leave unimplemented as abstract for now
|
// half checkerboard operations; leave unimplemented as abstract for now
|
||||||
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
|
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
|
||||||
@ -218,24 +217,6 @@ public:
|
|||||||
// Comms buffer
|
// Comms buffer
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
// Conserved current utilities
|
|
||||||
///////////////////////////////////////////////////////////////
|
|
||||||
void ContractConservedCurrent(PropagatorField &q_in_1,
|
|
||||||
PropagatorField &q_in_2,
|
|
||||||
PropagatorField &q_out,
|
|
||||||
Current curr_type,
|
|
||||||
unsigned int mu);
|
|
||||||
void SeqConservedCurrent(PropagatorField &q_in,
|
|
||||||
PropagatorField &q_out,
|
|
||||||
Current curr_type,
|
|
||||||
unsigned int mu,
|
|
||||||
unsigned int tmin,
|
|
||||||
unsigned int tmax,
|
|
||||||
ComplexField &lattice_cmplx);
|
|
||||||
|
|
||||||
void ContractJ5q(PropagatorField &q_in,ComplexField &J5q);
|
|
||||||
void ContractJ5q(FermionField &q_in,ComplexField &J5q);
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -41,6 +41,7 @@ public:
|
|||||||
static const int Dimension = Representation::Dimension;
|
static const int Dimension = Representation::Dimension;
|
||||||
static const bool isFundamental = Representation::isFundamental;
|
static const bool isFundamental = Representation::isFundamental;
|
||||||
static const bool LsVectorised=false;
|
static const bool LsVectorised=false;
|
||||||
|
static const bool isGparity=false;
|
||||||
static const int Nhcs = Options::Nhcs;
|
static const int Nhcs = Options::Nhcs;
|
||||||
|
|
||||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||||
@ -99,6 +100,19 @@ public:
|
|||||||
multLink(phi,U,chi,mu);
|
multLink(phi,U,chi,mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class _SpinorField>
|
||||||
|
inline void multLinkField(_SpinorField & out,
|
||||||
|
const DoubledGaugeField &Umu,
|
||||||
|
const _SpinorField & phi,
|
||||||
|
int mu)
|
||||||
|
{
|
||||||
|
autoView( out_v, out, AcceleratorWrite);
|
||||||
|
autoView( phi_v, phi, AcceleratorRead);
|
||||||
|
autoView( Umu_v, Umu, AcceleratorRead);
|
||||||
|
accelerator_for(sss,out.Grid()->oSites(),1,{
|
||||||
|
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
template <class ref>
|
template <class ref>
|
||||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||||
@ -177,18 +191,19 @@ public:
|
|||||||
int Ls=Btilde.Grid()->_fdimensions[0];
|
int Ls=Btilde.Grid()->_fdimensions[0];
|
||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
auto tmp_v = tmp.View();
|
{
|
||||||
auto Btilde_v = Btilde.View();
|
autoView( tmp_v , tmp, AcceleratorWrite);
|
||||||
auto Atilde_v = Atilde.View();
|
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||||
thread_for(sss,tmp.Grid()->oSites(),{
|
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||||
|
accelerator_for(sss,tmp.Grid()->oSites(),1,{
|
||||||
int sU=sss;
|
int sU=sss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
|
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
}
|
||||||
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -66,41 +66,6 @@ public:
|
|||||||
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
|
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
|
||||||
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
|
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Utilities for inserting Wilson conserved current.
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
|
||||||
static void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
|
|
||||||
const SitePropagator &q_in_2,
|
|
||||||
SitePropagator &q_out,
|
|
||||||
DoubledGaugeFieldView &U,
|
|
||||||
unsigned int sU,
|
|
||||||
unsigned int mu,
|
|
||||||
bool switch_sign = false);
|
|
||||||
|
|
||||||
static void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
|
|
||||||
const SitePropagator &q_in_2,
|
|
||||||
SitePropagator &q_out,
|
|
||||||
DoubledGaugeFieldView &U,
|
|
||||||
unsigned int sU,
|
|
||||||
unsigned int mu,
|
|
||||||
bool switch_sign = false);
|
|
||||||
|
|
||||||
static void SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
|
|
||||||
SitePropagator &q_out,
|
|
||||||
DoubledGaugeFieldView &U,
|
|
||||||
unsigned int sU,
|
|
||||||
unsigned int mu,
|
|
||||||
vPredicate t_mask,
|
|
||||||
bool switch_sign = false);
|
|
||||||
|
|
||||||
static void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
|
|
||||||
SitePropagator &q_out,
|
|
||||||
DoubledGaugeFieldView &U,
|
|
||||||
unsigned int sU,
|
|
||||||
unsigned int mu,
|
|
||||||
vPredicate t_mask,
|
|
||||||
bool switch_sign = false);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
|
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor * buf,
|
||||||
|
@ -120,7 +120,8 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual RealD M(const FermionField &in, FermionField &out) {
|
virtual void M(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
this->Dhop(in, out, DaggerNo);
|
this->Dhop(in, out, DaggerNo);
|
||||||
FermionField tmp(out.Grid());
|
FermionField tmp(out.Grid());
|
||||||
@ -129,11 +130,12 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
|||||||
ComplexD b(0.0,this->mu[s]);
|
ComplexD b(0.0,this->mu[s]);
|
||||||
axpbg5y_ssp(tmp,a,in,b,in,s,s);
|
axpbg5y_ssp(tmp,a,in,b,in,s,s);
|
||||||
}
|
}
|
||||||
return axpy_norm(out, 1.0, tmp, out);
|
axpy(out, 1.0, tmp, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
// needed for fast PV
|
// needed for fast PV
|
||||||
void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
|
void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu)
|
||||||
|
{
|
||||||
assert(_mass.size() == _mu.size());
|
assert(_mass.size() == _mu.size());
|
||||||
assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
|
assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
|
||||||
this->mass = _mass;
|
this->mass = _mass;
|
||||||
|
@ -180,7 +180,7 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
|
|||||||
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
|
||||||
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
|
||||||
#ifdef GRID_NVCC
|
#ifdef GRID_CUDA
|
||||||
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
@ -323,7 +323,7 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
FermionField Din(psi.Grid());
|
FermionField Din(psi.Grid());
|
||||||
|
|
||||||
@ -335,11 +335,10 @@ RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
|||||||
axpby(chi,1.0,1.0,chi,psi);
|
axpby(chi,1.0,1.0,chi,psi);
|
||||||
|
|
||||||
M5D(psi,chi);
|
M5D(psi,chi);
|
||||||
return(norm2(chi));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
// Under adjoint
|
// Under adjoint
|
||||||
//D1+ D1- P- -> D1+^dag P+ D2-^dag
|
//D1+ D1- P- -> D1+^dag P+ D2-^dag
|
||||||
@ -354,7 +353,6 @@ RealD CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
|||||||
M5Ddag(psi,chi);
|
M5Ddag(psi,chi);
|
||||||
// ((b D_W + D_w hop terms +1) on s-diag
|
// ((b D_W + D_w hop terms +1) on s-diag
|
||||||
axpby (chi,1.0,1.0,chi,psi);
|
axpby (chi,1.0,1.0,chi,psi);
|
||||||
return norm2(chi);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// half checkerboard operations
|
// half checkerboard operations
|
||||||
@ -588,6 +586,356 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
|
|||||||
// this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
|
// this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void CayleyFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
|
||||||
|
{
|
||||||
|
conformable(this->GaugeGrid(), J5q.Grid());
|
||||||
|
conformable(q_in.Grid(), this->FermionGrid());
|
||||||
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
|
// 4d field
|
||||||
|
int Ls = this->Ls;
|
||||||
|
FermionField psi(this->GaugeGrid());
|
||||||
|
FermionField p_plus (this->GaugeGrid());
|
||||||
|
FermionField p_minus(this->GaugeGrid());
|
||||||
|
FermionField p(this->GaugeGrid());
|
||||||
|
|
||||||
|
ExtractSlice(p_plus , q_in, Ls/2-1 , 0);
|
||||||
|
ExtractSlice(p_minus, q_in, Ls/2 , 0);
|
||||||
|
p_plus = p_plus + G5*p_plus;
|
||||||
|
p_minus= p_minus - G5*p_minus;
|
||||||
|
p=0.5*(p_plus+p_minus);
|
||||||
|
J5q = localInnerProduct(p,p);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void CayleyFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
|
||||||
|
{
|
||||||
|
conformable(this->GaugeGrid(), J5q.Grid());
|
||||||
|
conformable(q_in.Grid(), this->FermionGrid());
|
||||||
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
|
// 4d field
|
||||||
|
int Ls = this->Ls;
|
||||||
|
PropagatorField psi(this->GaugeGrid());
|
||||||
|
PropagatorField p_plus (this->GaugeGrid());
|
||||||
|
PropagatorField p_minus(this->GaugeGrid());
|
||||||
|
PropagatorField p(this->GaugeGrid());
|
||||||
|
|
||||||
|
ExtractSlice(p_plus , q_in, Ls/2-1 , 0);
|
||||||
|
ExtractSlice(p_minus, q_in, Ls/2 , 0);
|
||||||
|
p_plus = p_plus + G5*p_plus;
|
||||||
|
p_minus= p_minus - G5*p_minus;
|
||||||
|
p=0.5*(p_plus+p_minus);
|
||||||
|
J5q = localInnerProduct(p,p);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper macros for CayleyFermion5D<Impl>::ContractConservedCurrent.
//
// They are textual macros and pick up names from the expansion site:
//   g5       - used by Pp/Pm (the caller declares it as Gamma5)
//   Ls       - used by Q_4d
//   phys_src - used by TopRowWithSource
//   mass     - used by TopRowWithSource
//
// Every use of the argument is parenthesised so that a compound argument such
// as `a+b` keeps its grouping after expansion. Note that Pp/Pm expand their
// argument twice, so it must be free of side effects.

// 0.5*(Q + g5*Q)
#define Pp(Q) (0.5*((Q)+g5*(Q)))
// 0.5*(Q - g5*Q)
#define Pm(Q) (0.5*((Q)-g5*(Q)))
// Pm of the first element plus Pp of the last (index Ls-1) element of Q.
#define Q_4d(Q) (Pm((Q)[0]) + Pp((Q)[Ls-1]))
// phys_src plus (1-mass) times Q_4d(Q).
#define TopRowWithSource(Q) (phys_src + (1.0-mass)*Q_4d(Q))
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
|
||||||
|
PropagatorField &q_in_2,
|
||||||
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &phys_src,
|
||||||
|
Current curr_type,
|
||||||
|
unsigned int mu)
|
||||||
|
{
|
||||||
|
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||||
|
Gamma::Algebra Gmu [] = {
|
||||||
|
Gamma::Algebra::GammaX,
|
||||||
|
Gamma::Algebra::GammaY,
|
||||||
|
Gamma::Algebra::GammaZ,
|
||||||
|
Gamma::Algebra::GammaT,
|
||||||
|
Gamma::Algebra::Gamma5
|
||||||
|
};
|
||||||
|
|
||||||
|
auto UGrid= this->GaugeGrid();
|
||||||
|
auto FGrid= this->FermionGrid();
|
||||||
|
RealD sgn=1.0;
|
||||||
|
if ( curr_type == Current::Axial ) sgn = -1.0;
|
||||||
|
|
||||||
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
std::vector<PropagatorField> L_Q(Ls,UGrid);
|
||||||
|
std::vector<PropagatorField> R_Q(Ls,UGrid);
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
ExtractSlice(L_Q[s], q_in_1, s , 0);
|
||||||
|
ExtractSlice(R_Q[s], q_in_2, s , 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
Gamma g5(Gamma::Algebra::Gamma5);
|
||||||
|
PropagatorField C(UGrid);
|
||||||
|
PropagatorField p5d(UGrid);
|
||||||
|
PropagatorField us_p5d(UGrid);
|
||||||
|
PropagatorField gp5d(UGrid);
|
||||||
|
PropagatorField gus_p5d(UGrid);
|
||||||
|
|
||||||
|
PropagatorField L_TmLsGq0(UGrid);
|
||||||
|
PropagatorField L_TmLsTmp(UGrid);
|
||||||
|
PropagatorField R_TmLsGq0(UGrid);
|
||||||
|
PropagatorField R_TmLsTmp(UGrid);
|
||||||
|
{
|
||||||
|
PropagatorField TermA(UGrid);
|
||||||
|
PropagatorField TermB(UGrid);
|
||||||
|
PropagatorField TermC(UGrid);
|
||||||
|
PropagatorField TermD(UGrid);
|
||||||
|
TermA = (Pp(Q_4d(L_Q)));
|
||||||
|
TermB = (Pm(Q_4d(L_Q)));
|
||||||
|
TermC = (Pm(TopRowWithSource(L_Q)));
|
||||||
|
TermD = (Pp(TopRowWithSource(L_Q)));
|
||||||
|
|
||||||
|
L_TmLsGq0 = (TermD - TermA + TermB);
|
||||||
|
L_TmLsTmp = (TermC - TermB + TermA);
|
||||||
|
|
||||||
|
TermA = (Pp(Q_4d(R_Q)));
|
||||||
|
TermB = (Pm(Q_4d(R_Q)));
|
||||||
|
TermC = (Pm(TopRowWithSource(R_Q)));
|
||||||
|
TermD = (Pp(TopRowWithSource(R_Q)));
|
||||||
|
|
||||||
|
R_TmLsGq0 = (TermD - TermA + TermB);
|
||||||
|
R_TmLsTmp = (TermC - TermB + TermA);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<PropagatorField> R_TmLsGq(Ls,UGrid);
|
||||||
|
std::vector<PropagatorField> L_TmLsGq(Ls,UGrid);
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
R_TmLsGq[s] = (Pm((R_Q)[(s)]) + Pp((R_Q)[((s)-1+Ls)%Ls]));
|
||||||
|
L_TmLsGq[s] = (Pm((L_Q)[(s)]) + Pp((L_Q)[((s)-1+Ls)%Ls]));
|
||||||
|
}
|
||||||
|
|
||||||
|
Gamma gmu=Gamma(Gmu[mu]);
|
||||||
|
|
||||||
|
q_out = Zero();
|
||||||
|
PropagatorField tmp(UGrid);
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
|
||||||
|
int sp = (s+1)%Ls;
|
||||||
|
int sr = Ls-1-s;
|
||||||
|
int srp= (sr+1)%Ls;
|
||||||
|
|
||||||
|
// Mobius parameters
|
||||||
|
auto b=this->bs[s];
|
||||||
|
auto c=this->cs[s];
|
||||||
|
auto bpc = 1.0/(b+c); // -0.5 factor in gauge links
|
||||||
|
if (s == 0) {
|
||||||
|
p5d =(b*Pm(L_TmLsGq[Ls-1])+ c*Pp(L_TmLsGq[Ls-1]) + b*Pp(L_TmLsTmp) + c*Pm(L_TmLsTmp ));
|
||||||
|
tmp =(b*Pm(R_TmLsGq0) + c*Pp(R_TmLsGq0 ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
|
||||||
|
} else if (s == Ls-1) {
|
||||||
|
p5d =(b*Pm(L_TmLsGq0) + c*Pp(L_TmLsGq0 ) + b*Pp(L_TmLsGq[1]) + c*Pm(L_TmLsGq[1]));
|
||||||
|
tmp =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp) + c*Pm(R_TmLsTmp ));
|
||||||
|
} else {
|
||||||
|
p5d =(b*Pm(L_TmLsGq[sr]) + c*Pp(L_TmLsGq[sr])+ b*Pp(L_TmLsGq[srp])+ c*Pm(L_TmLsGq[srp]));
|
||||||
|
tmp =(b*Pm(R_TmLsGq[s]) + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
|
||||||
|
}
|
||||||
|
tmp = Cshift(tmp,mu,1);
|
||||||
|
Impl::multLinkField(us_p5d,this->Umu,tmp,mu);
|
||||||
|
|
||||||
|
gp5d=g5*p5d*g5;
|
||||||
|
gus_p5d=gmu*us_p5d;
|
||||||
|
|
||||||
|
C = bpc*(adj(gp5d)*us_p5d);
|
||||||
|
C-= bpc*(adj(gp5d)*gus_p5d);
|
||||||
|
|
||||||
|
if (s == 0) {
|
||||||
|
p5d =(b*Pm(R_TmLsGq0) + c*Pp(R_TmLsGq0 ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
|
||||||
|
tmp =(b*Pm(L_TmLsGq[Ls-1])+ c*Pp(L_TmLsGq[Ls-1]) + b*Pp(L_TmLsTmp) + c*Pm(L_TmLsTmp ));
|
||||||
|
} else if (s == Ls-1) {
|
||||||
|
p5d =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp) + c*Pm(R_TmLsTmp ));
|
||||||
|
tmp =(b*Pm(L_TmLsGq0) + c*Pp(L_TmLsGq0 ) + b*Pp(L_TmLsGq[1]) + c*Pm(L_TmLsGq[1]));
|
||||||
|
} else {
|
||||||
|
p5d =(b*Pm(R_TmLsGq[s]) + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
|
||||||
|
tmp =(b*Pm(L_TmLsGq[sr]) + c*Pp(L_TmLsGq[sr]) + b*Pp(L_TmLsGq[srp])+ c*Pm(L_TmLsGq[srp]));
|
||||||
|
}
|
||||||
|
tmp = Cshift(tmp,mu,1);
|
||||||
|
Impl::multLinkField(us_p5d,this->Umu,tmp,mu);
|
||||||
|
|
||||||
|
gp5d=gmu*p5d;
|
||||||
|
gus_p5d=g5*us_p5d*g5;
|
||||||
|
|
||||||
|
C-= bpc*(adj(gus_p5d)*gp5d);
|
||||||
|
C-= bpc*(adj(gus_p5d)*p5d);
|
||||||
|
|
||||||
|
if (s < Ls/2) q_out += sgn*C;
|
||||||
|
else q_out += C;
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||||
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &phys_src,
|
||||||
|
Current curr_type,
|
||||||
|
unsigned int mu,
|
||||||
|
unsigned int tmin,
|
||||||
|
unsigned int tmax,
|
||||||
|
ComplexField &ph)// Complex phase factor
|
||||||
|
{
|
||||||
|
assert(mu>=0);
|
||||||
|
assert(mu<Nd);
|
||||||
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||||
|
////////////////////////////////////////////////
|
||||||
|
// SHAMIR CASE
|
||||||
|
////////////////////////////////////////////////
|
||||||
|
int Ls = this->Ls;
|
||||||
|
auto UGrid= this->GaugeGrid();
|
||||||
|
auto FGrid= this->FermionGrid();
|
||||||
|
Gamma::Algebra Gmu [] = {
|
||||||
|
Gamma::Algebra::GammaX,
|
||||||
|
Gamma::Algebra::GammaY,
|
||||||
|
Gamma::Algebra::GammaZ,
|
||||||
|
Gamma::Algebra::GammaT
|
||||||
|
};
|
||||||
|
Gamma gmu=Gamma(Gmu[mu]);
|
||||||
|
|
||||||
|
PropagatorField L_Q(UGrid);
|
||||||
|
PropagatorField R_Q(UGrid);
|
||||||
|
|
||||||
|
PropagatorField tmp(UGrid);
|
||||||
|
PropagatorField Utmp(UGrid);
|
||||||
|
LatticeInteger zz (UGrid); zz=0.0;
|
||||||
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
|
for (int s=0;s<Ls;s++) {
|
||||||
|
|
||||||
|
RealD G_s = (curr_type == Current::Axial ) ? ((s < Ls/2) ? -1 : 1) : 1;
|
||||||
|
|
||||||
|
ExtractSlice(R_Q, q_in, s , 0);
|
||||||
|
|
||||||
|
tmp = Cshift(R_Q,mu,1);
|
||||||
|
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
|
||||||
|
tmp = G_s*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
|
||||||
|
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
|
||||||
|
tmp = where((lcoor<=tmax),tmp,zz);
|
||||||
|
L_Q = tmp;
|
||||||
|
|
||||||
|
tmp = R_Q*ph;
|
||||||
|
tmp = Cshift(tmp,mu,-1);
|
||||||
|
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd);// Adjoint link
|
||||||
|
tmp = -G_s*( Utmp + gmu*Utmp );
|
||||||
|
tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
|
||||||
|
tmp = where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
|
||||||
|
L_Q= L_Q+tmp;
|
||||||
|
|
||||||
|
InsertSlice(L_Q, q_out, s , 0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||||
|
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||||
|
////////////////////////////////////////////////
|
||||||
|
// GENERAL CAYLEY CASE
|
||||||
|
////////////////////////////////////////////////
|
||||||
|
Gamma::Algebra Gmu [] = {
|
||||||
|
Gamma::Algebra::GammaX,
|
||||||
|
Gamma::Algebra::GammaY,
|
||||||
|
Gamma::Algebra::GammaZ,
|
||||||
|
Gamma::Algebra::GammaT,
|
||||||
|
Gamma::Algebra::Gamma5
|
||||||
|
};
|
||||||
|
Gamma gmu=Gamma(Gmu[mu]);
|
||||||
|
Gamma g5(Gamma::Algebra::Gamma5);
|
||||||
|
|
||||||
|
int Ls = this->Ls;
|
||||||
|
auto UGrid= this->GaugeGrid();
|
||||||
|
auto FGrid= this->FermionGrid();
|
||||||
|
|
||||||
|
std::vector<PropagatorField> R_Q(Ls,UGrid);
|
||||||
|
PropagatorField L_Q(UGrid);
|
||||||
|
PropagatorField tmp(UGrid);
|
||||||
|
PropagatorField Utmp(UGrid);
|
||||||
|
|
||||||
|
LatticeInteger zz (UGrid); zz=0.0;
|
||||||
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
|
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
ExtractSlice(R_Q[s], q_in, s , 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
PropagatorField R_TmLsGq0(UGrid);
|
||||||
|
PropagatorField R_TmLsTmp(UGrid);
|
||||||
|
{
|
||||||
|
PropagatorField TermA(UGrid);
|
||||||
|
PropagatorField TermB(UGrid);
|
||||||
|
PropagatorField TermC(UGrid);
|
||||||
|
PropagatorField TermD(UGrid);
|
||||||
|
|
||||||
|
TermA = (Pp(Q_4d(R_Q)));
|
||||||
|
TermB = (Pm(Q_4d(R_Q)));
|
||||||
|
TermC = (Pm(TopRowWithSource(R_Q)));
|
||||||
|
TermD = (Pp(TopRowWithSource(R_Q)));
|
||||||
|
|
||||||
|
R_TmLsGq0 = (TermD - TermA + TermB);
|
||||||
|
R_TmLsTmp = (TermC - TermB + TermA);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<PropagatorField> R_TmLsGq(Ls,UGrid);
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
R_TmLsGq[s] = (Pm((R_Q)[(s)]) + Pp((R_Q)[((s)-1+Ls)%Ls]));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<RealD> G_s(Ls,1.0);
|
||||||
|
if ( curr_type == Current::Axial ) {
|
||||||
|
for(int s=0;s<Ls/2;s++){
|
||||||
|
G_s[s] = -1.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
|
||||||
|
int sp = (s+1)%Ls;
|
||||||
|
int sr = Ls-1-s;
|
||||||
|
int srp= (sr+1)%Ls;
|
||||||
|
|
||||||
|
// Mobius parameters
|
||||||
|
auto b=this->bs[s];
|
||||||
|
auto c=this->cs[s];
|
||||||
|
// auto bpc = G_s[s]*1.0/(b+c); // -0.5 factor in gauge links
|
||||||
|
|
||||||
|
if (s == 0) {
|
||||||
|
tmp =(b*Pm(R_TmLsGq0) + c*Pp(R_TmLsGq0 ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
|
||||||
|
} else if (s == Ls-1) {
|
||||||
|
tmp =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp) + c*Pm(R_TmLsTmp ));
|
||||||
|
} else {
|
||||||
|
tmp =(b*Pm(R_TmLsGq[s]) + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp ])+ c*Pm(R_TmLsGq[sp]));
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp = Cshift(tmp,mu,1);
|
||||||
|
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
|
||||||
|
tmp = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
|
||||||
|
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
|
||||||
|
L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated
|
||||||
|
|
||||||
|
if (s == 0) {
|
||||||
|
tmp =(b*Pm(R_TmLsGq0) + c*Pp(R_TmLsGq0 ) + b*Pp(R_TmLsGq[1]) + c*Pm(R_TmLsGq[1]));
|
||||||
|
} else if (s == Ls-1) {
|
||||||
|
tmp =(b*Pm(R_TmLsGq[Ls-1])+ c*Pp(R_TmLsGq[Ls-1]) + b*Pp(R_TmLsTmp) + c*Pm(R_TmLsTmp ));
|
||||||
|
} else {
|
||||||
|
tmp =(b*Pm(R_TmLsGq[s]) + c*Pp(R_TmLsGq[s]) + b*Pp(R_TmLsGq[sp])+ c*Pm(R_TmLsGq[sp]));
|
||||||
|
}
|
||||||
|
tmp = tmp *ph;
|
||||||
|
tmp = Cshift(tmp,mu,-1);
|
||||||
|
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
|
||||||
|
tmp = -G_s[s]*( Utmp + gmu*Utmp );
|
||||||
|
tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
|
||||||
|
L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
|
||||||
|
|
||||||
|
InsertSlice(L_Q, q_out, s , 0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#undef Pp
|
||||||
|
#undef Pm
|
||||||
|
#undef Q_4d
|
||||||
|
#undef TopRowWithSource
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
|
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
|
||||||
|
@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
|||||||
|
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
auto phi = phi_i.View();
|
autoView(phi , phi_i,AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
auto phi = phi_i.View();
|
autoView(phi , phi_i,AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
|
|||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
|
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
|
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
|
|||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & lee [0];
|
auto plee = & lee [0];
|
||||||
auto pdee = & dee [0];
|
auto pdee = & dee [0];
|
||||||
|
@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
|||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
auto psi = psi_i.View();
|
autoView(psi, psi_i,CpuRead);
|
||||||
auto phi = phi_i.View();
|
autoView(phi, phi_i,CpuRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi, chi_i,CpuWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
const int nsimd= Simd::Nsimd();
|
const int nsimd= Simd::Nsimd();
|
||||||
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
|||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
auto psi=psi_i.View();
|
autoView(psi,psi_i,CpuRead);
|
||||||
auto phi=phi_i.View();
|
autoView(phi,phi_i,CpuRead);
|
||||||
auto chi=chi_i.View();
|
autoView(chi,chi_i,CpuWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
int nsimd= Simd::Nsimd();
|
int nsimd= Simd::Nsimd();
|
||||||
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
|
|||||||
Vector<iSinglet<Simd> > &Matm)
|
Vector<iSinglet<Simd> > &Matm)
|
||||||
{
|
{
|
||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i,CpuRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i,CpuWrite);
|
||||||
#ifndef AVX512
|
#ifndef AVX512
|
||||||
{
|
{
|
||||||
SiteHalfSpinor BcastP;
|
SiteHalfSpinor BcastP;
|
||||||
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
|
|||||||
EnableIf<Impl::LsVectorised,int> sfinae=0;
|
EnableIf<Impl::LsVectorised,int> sfinae=0;
|
||||||
#ifndef AVX512
|
#ifndef AVX512
|
||||||
{
|
{
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i,CpuRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i,CpuWrite);
|
||||||
|
|
||||||
SiteHalfSpinor BcastP;
|
SiteHalfSpinor BcastP;
|
||||||
SiteHalfSpinor BcastM;
|
SiteHalfSpinor BcastM;
|
||||||
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
|
|||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
{
|
{
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i,CpuRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i,CpuWrite);
|
||||||
// pointers
|
// pointers
|
||||||
// MASK_REGS;
|
// MASK_REGS;
|
||||||
#define Chi_00 %zmm0
|
#define Chi_00 %zmm0
|
||||||
|
@ -94,7 +94,7 @@ void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Ap
|
|||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
void ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
@ -116,15 +116,14 @@ RealD ContinuedFractionFermion5D<Impl>::M (const FermionField &psi, F
|
|||||||
}
|
}
|
||||||
sign=-sign;
|
sign=-sign;
|
||||||
}
|
}
|
||||||
return norm2(chi);
|
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD ContinuedFractionFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
void ContinuedFractionFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
// This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
|
// This matrix is already hermitian. (g5 Dw) = Dw dag g5 = (g5 Dw)dag
|
||||||
// The rest of matrix is symmetric.
|
// The rest of matrix is symmetric.
|
||||||
// Can ignore "dag"
|
// Can ignore "dag"
|
||||||
return M(psi,chi);
|
M(psi,chi);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
|
void ContinuedFractionFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
|
||||||
|
@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
auto phi = phi_i.View();
|
autoView( phi , phi_i, AcceleratorRead);
|
||||||
auto psi = psi_i.View();
|
autoView( psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView( chi , chi_i, AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
auto pupper = &upper[0];
|
auto pupper = &upper[0];
|
||||||
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
|
|||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto psi = psi_i.View();
|
autoView( psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View();
|
autoView( phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView( chi , chi_i, AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
auto pupper = &upper[0];
|
auto pupper = &upper[0];
|
||||||
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
auto psi=psi_i.View();
|
autoView( psi, psi_i, AcceleratorRead);
|
||||||
auto chi=chi_i.View();
|
autoView( chi, chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto plee = & this->lee[0];
|
auto plee = & this->lee[0];
|
||||||
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
auto psi = psi_i.View();
|
autoView( psi, psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView( chi, chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto plee = & this->lee[0];
|
auto plee = & this->lee[0];
|
||||||
|
@ -89,7 +89,7 @@ void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionFiel
|
|||||||
/*****************************************************************************************************/
|
/*****************************************************************************************************/
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
void DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
||||||
{
|
{
|
||||||
FermionField Din(psi.Grid());
|
FermionField Din(psi.Grid());
|
||||||
|
|
||||||
@ -97,11 +97,10 @@ RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
|||||||
this->DW(Din, chi, DaggerNo);
|
this->DW(Din, chi, DaggerNo);
|
||||||
axpby(chi, 1.0, 1.0, chi, psi);
|
axpby(chi, 1.0, 1.0, chi, psi);
|
||||||
this->M5D(psi, chi);
|
this->M5D(psi, chi);
|
||||||
return(norm2(chi));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
void DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
||||||
{
|
{
|
||||||
FermionField Din(psi.Grid());
|
FermionField Din(psi.Grid());
|
||||||
|
|
||||||
@ -109,7 +108,6 @@ RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& c
|
|||||||
this->MeooeDag5D(Din, chi);
|
this->MeooeDag5D(Din, chi);
|
||||||
this->M5Ddag(psi, chi);
|
this->M5Ddag(psi, chi);
|
||||||
axpby(chi, 1.0, 1.0, chi, psi);
|
axpby(chi, 1.0, 1.0, chi, psi);
|
||||||
return(norm2(chi));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/********************************************************************
|
/********************************************************************
|
||||||
|
@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
|
|||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
auto Umu_v = Umu.View();
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
auto UUUmu_v = UUUmu.View();
|
autoView( UUUmu_v , UUUmu, CpuRead);
|
||||||
auto in_v = in.View();
|
autoView( in_v , in, CpuRead);
|
||||||
auto out_v = out.View();
|
autoView( out_v , out, CpuWrite);
|
||||||
thread_for( ss,Umu.Grid()->oSites(),{
|
thread_for( ss,Umu.Grid()->oSites(),{
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
@ -281,11 +281,9 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
#ifdef GRID_OMP
|
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
else
|
else
|
||||||
#endif
|
|
||||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -294,9 +292,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
|||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
#ifdef GRID_OMP
|
|
||||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
|
|
||||||
int LLs = in.Grid()->_rdimensions[0];
|
int LLs = in.Grid()->_rdimensions[0];
|
||||||
@ -305,99 +301,42 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
|||||||
DhopFaceTime-=usecond();
|
DhopFaceTime-=usecond();
|
||||||
st.Prepare();
|
st.Prepare();
|
||||||
st.HaloGather(in,compressor);
|
st.HaloGather(in,compressor);
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
DhopCommTime -=usecond();
|
||||||
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
|
st.CommunicateBegin(requests);
|
||||||
|
|
||||||
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
double ctime=0;
|
|
||||||
double ptime=0;
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
// Remove explicit thread mapping introduced for OPA reasons.
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
DhopComputeTime-=usecond();
|
||||||
{
|
{
|
||||||
int tid = omp_get_thread_num();
|
int interior=1;
|
||||||
int nthreads = omp_get_num_threads();
|
int exterior=0;
|
||||||
int ncomms = CartesianCommunicator::nCommThreads;
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
if (ncomms == -1) ncomms = 1;
|
|
||||||
assert(nthreads > ncomms);
|
|
||||||
if (tid >= ncomms) {
|
|
||||||
double start = usecond();
|
|
||||||
nthreads -= ncomms;
|
|
||||||
int ttid = tid - ncomms;
|
|
||||||
int n = U.Grid()->oSites(); // 4d vol
|
|
||||||
int chunk = n / nthreads;
|
|
||||||
int rem = n % nthreads;
|
|
||||||
int myblock, myn;
|
|
||||||
if (ttid < rem) {
|
|
||||||
myblock = ttid * chunk + ttid;
|
|
||||||
myn = chunk+1;
|
|
||||||
} else {
|
|
||||||
myblock = ttid*chunk + rem;
|
|
||||||
myn = chunk;
|
|
||||||
}
|
}
|
||||||
|
DhopComputeTime+=usecond();
|
||||||
// do the compute
|
|
||||||
auto U_v = U.View();
|
|
||||||
auto UUU_v = UUU.View();
|
|
||||||
auto in_v = in.View();
|
|
||||||
auto out_v = out.View();
|
|
||||||
|
|
||||||
if (dag == DaggerYes) {
|
|
||||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
|
||||||
int sU = ss;
|
|
||||||
// Interior = 1; Exterior = 0; must implement for staggered
|
|
||||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<---------
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
|
||||||
// Interior = 1; Exterior = 0;
|
|
||||||
int sU = ss;
|
|
||||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,1,0); //<------------
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ptime = usecond() - start;
|
|
||||||
} else {
|
|
||||||
double start = usecond();
|
|
||||||
st.CommunicateThreaded();
|
|
||||||
ctime = usecond() - start;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
DhopCommTime += ctime;
|
|
||||||
DhopComputeTime+=ptime;
|
|
||||||
|
|
||||||
// First to enter, last to leave timing
|
|
||||||
st.CollateThreads();
|
|
||||||
|
|
||||||
DhopFaceTime-=usecond();
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
DhopFaceTime+=usecond();
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
DhopComputeTime2-=usecond();
|
st.CommunicateComplete(requests);
|
||||||
|
DhopCommTime +=usecond();
|
||||||
|
|
||||||
auto U_v = U.View();
|
DhopComputeTime2-=usecond();
|
||||||
auto UUU_v = UUU.View();
|
{
|
||||||
auto in_v = in.View();
|
int interior=0;
|
||||||
auto out_v = out.View();
|
int exterior=1;
|
||||||
if (dag == DaggerYes) {
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
int sz=st.surface_list.size();
|
|
||||||
thread_for( ss,sz,{
|
|
||||||
int sU = st.surface_list[ss];
|
|
||||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1); //<----------
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
int sz=st.surface_list.size();
|
|
||||||
thread_for( ss,sz,{
|
|
||||||
int sU = st.surface_list[ss];
|
|
||||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v,0,1);//<----------
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
DhopComputeTime2+=usecond();
|
DhopComputeTime2+=usecond();
|
||||||
#else
|
|
||||||
assert(0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -408,8 +347,6 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
int LLs = in.Grid()->_rdimensions[0];
|
int LLs = in.Grid()->_rdimensions[0];
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//double t1=usecond();
|
//double t1=usecond();
|
||||||
DhopTotalTime -= usecond();
|
DhopTotalTime -= usecond();
|
||||||
DhopCommTime -= usecond();
|
DhopCommTime -= usecond();
|
||||||
@ -418,28 +355,13 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
|
|
||||||
DhopComputeTime -= usecond();
|
DhopComputeTime -= usecond();
|
||||||
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
auto U_v = U.View();
|
{
|
||||||
auto UUU_v = UUU.View();
|
int interior=1;
|
||||||
auto in_v = in.View();
|
int exterior=1;
|
||||||
auto out_v = out.View();
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
if (dag == DaggerYes) {
|
|
||||||
thread_for( ss,U.Grid()->oSites(),{
|
|
||||||
int sU=ss;
|
|
||||||
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), LLs, sU,in_v, out_v);
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
thread_for( ss,U.Grid()->oSites(),{
|
|
||||||
int sU=ss;
|
|
||||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),LLs,sU,in_v,out_v);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
DhopTotalTime += usecond();
|
DhopTotalTime += usecond();
|
||||||
//double t2=usecond();
|
|
||||||
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
|
|
||||||
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
|
|
||||||
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
|
|
||||||
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
/*CHANGE END*/
|
/*CHANGE END*/
|
||||||
@ -548,21 +470,24 @@ void ImprovedStaggeredFermion5D<Impl>::MdirAll(const FermionField &in, std::vect
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Dhop(in, out, DaggerNo);
|
Dhop(in, out, DaggerNo);
|
||||||
return axpy_norm(out, mass, in, out);
|
axpy(out, mass, in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Dhop(in, out, DaggerYes);
|
Dhop(in, out, DaggerYes);
|
||||||
return axpy_norm(out, mass, in, out);
|
axpy(out, mass, in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
if (in.Checkerboard() == Odd) {
|
if (in.Checkerboard() == Odd) {
|
||||||
DhopEO(in, out, DaggerNo);
|
DhopEO(in, out, DaggerNo);
|
||||||
} else {
|
} else {
|
||||||
@ -570,7 +495,8 @@ void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionFiel
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
if (in.Checkerboard() == Odd) {
|
if (in.Checkerboard() == Odd) {
|
||||||
DhopEO(in, out, DaggerYes);
|
DhopEO(in, out, DaggerYes);
|
||||||
} else {
|
} else {
|
||||||
@ -579,27 +505,30 @@ void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionF
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
typename FermionField::scalar_type scal(mass);
|
typename FermionField::scalar_type scal(mass);
|
||||||
out = scal * in;
|
out = scal * in;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Mooee(in, out);
|
Mooee(in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
out = (1.0 / (mass)) * in;
|
out = (1.0 / (mass)) * in;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
|
void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,FermionField &out)
|
||||||
FermionField &out) {
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
MooeeInv(in, out);
|
MooeeInv(in, out);
|
||||||
}
|
}
|
||||||
@ -611,6 +540,7 @@ template <class Impl>
|
|||||||
void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
PropagatorField &q_in_2,
|
PropagatorField &q_in_2,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu)
|
unsigned int mu)
|
||||||
{
|
{
|
||||||
@ -620,6 +550,7 @@ void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
|
@ -171,21 +171,24 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Dhop(in, out, DaggerNo);
|
Dhop(in, out, DaggerNo);
|
||||||
return axpy_norm(out, mass, in, out);
|
axpy(out, mass, in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Dhop(in, out, DaggerYes);
|
Dhop(in, out, DaggerYes);
|
||||||
return axpy_norm(out, mass, in, out);
|
axpy(out, mass, in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
if (in.Checkerboard() == Odd) {
|
if (in.Checkerboard() == Odd) {
|
||||||
DhopEO(in, out, DaggerNo);
|
DhopEO(in, out, DaggerNo);
|
||||||
} else {
|
} else {
|
||||||
@ -193,7 +196,8 @@ void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
if (in.Checkerboard() == Odd) {
|
if (in.Checkerboard() == Odd) {
|
||||||
DhopEO(in, out, DaggerYes);
|
DhopEO(in, out, DaggerYes);
|
||||||
} else {
|
} else {
|
||||||
@ -202,27 +206,30 @@ void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionFie
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
typename FermionField::scalar_type scal(mass);
|
typename FermionField::scalar_type scal(mass);
|
||||||
out = scal * in;
|
out = scal * in;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Mooee(in, out);
|
Mooee(in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
out = (1.0 / (mass)) * in;
|
out = (1.0 / (mass)) * in;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
|
void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,FermionField &out)
|
||||||
FermionField &out) {
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
MooeeInv(in, out);
|
MooeeInv(in, out);
|
||||||
}
|
}
|
||||||
@ -234,7 +241,8 @@ void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
GaugeField & mat,
|
GaugeField & mat,
|
||||||
const FermionField &A, const FermionField &B, int dag) {
|
const FermionField &A, const FermionField &B, int dag)
|
||||||
|
{
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
@ -250,10 +258,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
|
|||||||
////////////////////////
|
////////////////////////
|
||||||
// Call the single hop
|
// Call the single hop
|
||||||
////////////////////////
|
////////////////////////
|
||||||
auto U_v = U.View();
|
autoView( U_v , U, CpuRead);
|
||||||
auto UUU_v = UUU.View();
|
autoView( UUU_v , UUU, CpuRead);
|
||||||
auto B_v = B.View();
|
autoView( B_v , B, CpuWrite);
|
||||||
auto Btilde_v = Btilde.View();
|
autoView( Btilde_v , Btilde, CpuWrite);
|
||||||
thread_for(sss,B.Grid()->oSites(),{
|
thread_for(sss,B.Grid()->oSites(),{
|
||||||
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
||||||
});
|
});
|
||||||
@ -284,8 +292,8 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
|
{
|
||||||
conformable(U.Grid(), _grid);
|
conformable(U.Grid(), _grid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
conformable(U.Grid(), mat.Grid());
|
conformable(U.Grid(), mat.Grid());
|
||||||
@ -296,8 +304,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionFie
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
|
{
|
||||||
conformable(U.Grid(), _cbgrid);
|
conformable(U.Grid(), _cbgrid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
conformable(U.Grid(), mat.Grid());
|
conformable(U.Grid(), mat.Grid());
|
||||||
@ -310,8 +318,8 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionF
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
|
{
|
||||||
conformable(U.Grid(), _cbgrid);
|
conformable(U.Grid(), _cbgrid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
conformable(U.Grid(), mat.Grid());
|
conformable(U.Grid(), mat.Grid());
|
||||||
@ -378,10 +386,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
|
|||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
Stencil.HaloExchange(in, compressor);
|
Stencil.HaloExchange(in, compressor);
|
||||||
auto Umu_v = Umu.View();
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
auto UUUmu_v = UUUmu.View();
|
autoView( UUUmu_v , UUUmu, CpuRead);
|
||||||
auto in_v = in.View();
|
autoView( in_v , in, CpuRead);
|
||||||
auto out_v = out.View();
|
autoView( out_v , out, CpuWrite);
|
||||||
thread_for( sss, in.Grid()->oSites(),{
|
thread_for( sss, in.Grid()->oSites(),{
|
||||||
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
||||||
});
|
});
|
||||||
@ -395,11 +403,9 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
|
|||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
#ifdef GRID_OMP
|
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
else
|
else
|
||||||
#endif
|
|
||||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -409,7 +415,6 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
#ifdef GRID_OMP
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
int len = U.Grid()->oSites();
|
int len = U.Grid()->oSites();
|
||||||
|
|
||||||
@ -418,60 +423,30 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
DhopFaceTime -= usecond();
|
DhopFaceTime -= usecond();
|
||||||
st.Prepare();
|
st.Prepare();
|
||||||
st.HaloGather(in,compressor);
|
st.HaloGather(in,compressor);
|
||||||
|
DhopFaceTime += usecond();
|
||||||
|
|
||||||
|
DhopCommTime -=usecond();
|
||||||
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
|
st.CommunicateBegin(requests);
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMergeSHM(compressor);
|
st.CommsMergeSHM(compressor);
|
||||||
DhopFaceTime+= usecond();
|
DhopFaceTime+= usecond();
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Ugly explicit thread mapping introduced for OPA reasons.
|
// Removed explicit thread comms
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
DhopComputeTime -= usecond();
|
DhopComputeTime -= usecond();
|
||||||
#pragma omp parallel
|
|
||||||
{
|
{
|
||||||
int tid = omp_get_thread_num();
|
int interior=1;
|
||||||
int nthreads = omp_get_num_threads();
|
int exterior=0;
|
||||||
int ncomms = CartesianCommunicator::nCommThreads;
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
if (ncomms == -1) ncomms = 1;
|
|
||||||
assert(nthreads > ncomms);
|
|
||||||
|
|
||||||
if (tid >= ncomms) {
|
|
||||||
nthreads -= ncomms;
|
|
||||||
int ttid = tid - ncomms;
|
|
||||||
int n = len;
|
|
||||||
int chunk = n / nthreads;
|
|
||||||
int rem = n % nthreads;
|
|
||||||
int myblock, myn;
|
|
||||||
if (ttid < rem) {
|
|
||||||
myblock = ttid * chunk + ttid;
|
|
||||||
myn = chunk+1;
|
|
||||||
} else {
|
|
||||||
myblock = ttid*chunk + rem;
|
|
||||||
myn = chunk;
|
|
||||||
}
|
|
||||||
|
|
||||||
// do the compute
|
|
||||||
auto U_v = U.View();
|
|
||||||
auto UUU_v = UUU.View();
|
|
||||||
auto in_v = in.View();
|
|
||||||
auto out_v = out.View();
|
|
||||||
if (dag == DaggerYes) {
|
|
||||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
|
||||||
int sU = ss;
|
|
||||||
// Interior = 1; Exterior = 0; must implement for staggered
|
|
||||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
|
||||||
// Interior = 1; Exterior = 0;
|
|
||||||
int sU = ss;
|
|
||||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,1,0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
st.CommunicateThreaded();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
|
|
||||||
|
st.CommunicateComplete(requests);
|
||||||
|
DhopCommTime +=usecond();
|
||||||
|
|
||||||
// First to enter, last to leave timing
|
// First to enter, last to leave timing
|
||||||
DhopFaceTime -= usecond();
|
DhopFaceTime -= usecond();
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
@ -479,28 +454,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
|
|
||||||
DhopComputeTime2 -= usecond();
|
DhopComputeTime2 -= usecond();
|
||||||
{
|
{
|
||||||
auto U_v = U.View();
|
int interior=0;
|
||||||
auto UUU_v = UUU.View();
|
int exterior=1;
|
||||||
auto in_v = in.View();
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
auto out_v = out.View();
|
|
||||||
if (dag == DaggerYes) {
|
|
||||||
int sz=st.surface_list.size();
|
|
||||||
thread_for(ss,sz,{
|
|
||||||
int sU = st.surface_list[ss];
|
|
||||||
Kernels::DhopSiteDag(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
int sz=st.surface_list.size();
|
|
||||||
thread_for(ss,sz,{
|
|
||||||
int sU = st.surface_list[ss];
|
|
||||||
Kernels::DhopSite(st,lo,U_v,UUU_v,st.CommBuf(),1,sU,in_v,out_v,0,1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
DhopComputeTime2 += usecond();
|
DhopComputeTime2 += usecond();
|
||||||
#else
|
|
||||||
assert(0);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -520,19 +478,11 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
|
|||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
DhopCommTime += usecond();
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
auto U_v = U.View();
|
|
||||||
auto UUU_v = UUU.View();
|
|
||||||
auto in_v = in.View();
|
|
||||||
auto out_v = out.View();
|
|
||||||
DhopComputeTime -= usecond();
|
DhopComputeTime -= usecond();
|
||||||
if (dag == DaggerYes) {
|
{
|
||||||
thread_for(sss, in.Grid()->oSites(),{
|
int interior=1;
|
||||||
Kernels::DhopSiteDag(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
|
int exterior=1;
|
||||||
});
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
} else {
|
|
||||||
thread_for(sss, in.Grid()->oSites(),{
|
|
||||||
Kernels::DhopSite(st, lo, U_v, UUU_v, st.CommBuf(), 1, sss, in_v, out_v);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
DhopTotalTime += usecond();
|
DhopTotalTime += usecond();
|
||||||
@ -600,6 +550,7 @@ template <class Impl>
|
|||||||
void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
PropagatorField &q_in_2,
|
PropagatorField &q_in_2,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu)
|
unsigned int mu)
|
||||||
{
|
{
|
||||||
@ -609,6 +560,7 @@ void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
|
@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View();
|
autoView(phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View();
|
autoView(phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||||
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View();
|
autoView(phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
|
|||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View();
|
autoView(phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
auto pdee = & this->dee [0];
|
auto pdee = & this->dee [0];
|
||||||
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
auto pdee = & this->dee [0];
|
auto pdee = & this->dee [0];
|
||||||
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
auto psi = psi_i.View();
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View();
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
|
@ -166,7 +166,7 @@ void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& c
|
|||||||
/*****************************************************************************************************/
|
/*****************************************************************************************************/
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
void MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
||||||
{
|
{
|
||||||
FermionField Din(psi.Grid());
|
FermionField Din(psi.Grid());
|
||||||
|
|
||||||
@ -174,11 +174,10 @@ RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
|
|||||||
this->DW(Din, chi, DaggerNo);
|
this->DW(Din, chi, DaggerNo);
|
||||||
axpby(chi, 1.0, 1.0, chi, psi);
|
axpby(chi, 1.0, 1.0, chi, psi);
|
||||||
this->M5D(psi, chi);
|
this->M5D(psi, chi);
|
||||||
return(norm2(chi));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
void MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
||||||
{
|
{
|
||||||
FermionField Din(psi.Grid());
|
FermionField Din(psi.Grid());
|
||||||
|
|
||||||
@ -186,7 +185,6 @@ RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
|
|||||||
this->MeooeDag5D(Din, chi);
|
this->MeooeDag5D(Din, chi);
|
||||||
this->M5Ddag(psi, chi);
|
this->M5Ddag(psi, chi);
|
||||||
axpby(chi, 1.0, 1.0, chi, psi);
|
axpby(chi, 1.0, 1.0, chi, psi);
|
||||||
return(norm2(chi));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/********************************************************************
|
/********************************************************************
|
||||||
|
@ -0,0 +1,499 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi, Peter Boyle
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
/////////////////////////////////
|
||||||
|
// Constructor and gauge import
|
||||||
|
/////////////////////////////////
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
||||||
|
RealD _mass,
|
||||||
|
RealD _c1, RealD _u0,
|
||||||
|
const ImplParams &p)
|
||||||
|
: Kernels(p),
|
||||||
|
_grid(&Fgrid),
|
||||||
|
_cbgrid(&Hgrid),
|
||||||
|
Stencil(&Fgrid, npoint, Even, directions, displacements,p),
|
||||||
|
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||||
|
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||||
|
mass(_mass),
|
||||||
|
Lebesgue(_grid),
|
||||||
|
LebesgueEvenOdd(_cbgrid),
|
||||||
|
Umu(&Fgrid),
|
||||||
|
UmuEven(&Hgrid),
|
||||||
|
UmuOdd(&Hgrid),
|
||||||
|
_tmp(&Hgrid)
|
||||||
|
{
|
||||||
|
int vol4;
|
||||||
|
int LLs=1;
|
||||||
|
c1=_c1;
|
||||||
|
u0=_u0;
|
||||||
|
vol4= _grid->oSites();
|
||||||
|
Stencil.BuildSurfaceList(LLs,vol4);
|
||||||
|
vol4= _cbgrid->oSites();
|
||||||
|
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||||
|
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
|
||||||
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
|
RealD _c1, RealD _u0,
|
||||||
|
const ImplParams &p)
|
||||||
|
: NaiveStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_u0,p)
|
||||||
|
{
|
||||||
|
ImportGauge(_U);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
// Momentum space propagator should be
|
||||||
|
// https://arxiv.org/pdf/hep-lat/9712010.pdf
|
||||||
|
//
|
||||||
|
// mom space action.
|
||||||
|
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
|
||||||
|
//
|
||||||
|
// must track through staggered flavour/spin reduction in literature to
|
||||||
|
// turn to free propagator for the one component chi field, a la page 4/5
|
||||||
|
// of above link to implement fourier based solver.
|
||||||
|
////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
|
||||||
|
{
|
||||||
|
pickCheckerboard(Even, UmuEven, Umu);
|
||||||
|
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::ImportGauge(const GaugeField &_U)
|
||||||
|
{
|
||||||
|
GaugeLinkField U(GaugeGrid());
|
||||||
|
DoubledGaugeField _UUU(GaugeGrid());
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
// Double Store should take two fields for Naik and one hop separately.
|
||||||
|
// Discard the Naik as Naive
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
Impl::DoubleStore(GaugeGrid(), _UUU, Umu, _U, _U );
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
// Apply scale factors to get the right fermion Kinetic term
|
||||||
|
// Could pass coeffs into the double store to save work.
|
||||||
|
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
|
|
||||||
|
U = PeekIndex<LorentzIndex>(Umu, mu);
|
||||||
|
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
|
||||||
|
|
||||||
|
U = PeekIndex<LorentzIndex>(Umu, mu+4);
|
||||||
|
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
|
|
||||||
|
/////////////////////////////
|
||||||
|
// Implement the interface
|
||||||
|
/////////////////////////////
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
Dhop(in, out, DaggerNo);
|
||||||
|
axpy(out, mass, in, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
Dhop(in, out, DaggerYes);
|
||||||
|
axpy(out, mass, in, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
||||||
|
if (in.Checkerboard() == Odd) {
|
||||||
|
DhopEO(in, out, DaggerNo);
|
||||||
|
} else {
|
||||||
|
DhopOE(in, out, DaggerNo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
||||||
|
if (in.Checkerboard() == Odd) {
|
||||||
|
DhopEO(in, out, DaggerYes);
|
||||||
|
} else {
|
||||||
|
DhopOE(in, out, DaggerYes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
typename FermionField::scalar_type scal(mass);
|
||||||
|
out = scal * in;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
Mooee(in, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
out = (1.0 / (mass)) * in;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
MooeeInv(in, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////
|
||||||
|
// Internal
|
||||||
|
///////////////////////////////////
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||||
|
GaugeField & mat,
|
||||||
|
const FermionField &A, const FermionField &B, int dag)
|
||||||
|
{
|
||||||
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
|
Compressor compressor;
|
||||||
|
|
||||||
|
FermionField Btilde(B.Grid());
|
||||||
|
FermionField Atilde(B.Grid());
|
||||||
|
Atilde = A;
|
||||||
|
|
||||||
|
st.HaloExchange(B, compressor);
|
||||||
|
|
||||||
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
|
|
||||||
|
////////////////////////
|
||||||
|
// Call the single hop
|
||||||
|
////////////////////////
|
||||||
|
autoView( U_v , U, CpuRead);
|
||||||
|
autoView( B_v , B, CpuWrite);
|
||||||
|
autoView( Btilde_v , Btilde, CpuWrite);
|
||||||
|
thread_for(sss,B.Grid()->oSites(),{
|
||||||
|
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
||||||
|
});
|
||||||
|
|
||||||
|
assert(0);// need to figure out the force interface with a blasted three link term.
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||||
|
|
||||||
|
conformable(U.Grid(), _grid);
|
||||||
|
conformable(U.Grid(), V.Grid());
|
||||||
|
conformable(U.Grid(), mat.Grid());
|
||||||
|
|
||||||
|
mat.Checkerboard() = U.Checkerboard();
|
||||||
|
|
||||||
|
DerivInternal(Stencil, Umu, mat, U, V, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||||
|
|
||||||
|
conformable(U.Grid(), _cbgrid);
|
||||||
|
conformable(U.Grid(), V.Grid());
|
||||||
|
conformable(U.Grid(), mat.Grid());
|
||||||
|
|
||||||
|
assert(V.Checkerboard() == Even);
|
||||||
|
assert(U.Checkerboard() == Odd);
|
||||||
|
mat.Checkerboard() = Odd;
|
||||||
|
|
||||||
|
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
||||||
|
|
||||||
|
conformable(U.Grid(), _cbgrid);
|
||||||
|
conformable(U.Grid(), V.Grid());
|
||||||
|
conformable(U.Grid(), mat.Grid());
|
||||||
|
|
||||||
|
assert(V.Checkerboard() == Odd);
|
||||||
|
assert(U.Checkerboard() == Even);
|
||||||
|
mat.Checkerboard() = Even;
|
||||||
|
|
||||||
|
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
DhopCalls+=2;
|
||||||
|
conformable(in.Grid(), _grid); // verifies full grid
|
||||||
|
conformable(in.Grid(), out.Grid());
|
||||||
|
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
|
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
DhopCalls+=1;
|
||||||
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
|
assert(in.Checkerboard() == Even);
|
||||||
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
|
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
DhopCalls+=1;
|
||||||
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
|
assert(in.Checkerboard() == Odd);
|
||||||
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
|
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp)
|
||||||
|
{
|
||||||
|
DhopDir(in, out, dir, disp);
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::MdirAll(const FermionField &in, std::vector<FermionField> &out)
|
||||||
|
{
|
||||||
|
assert(0); // Not implemented yet
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp)
|
||||||
|
{
|
||||||
|
|
||||||
|
Compressor compressor;
|
||||||
|
Stencil.HaloExchange(in, compressor);
|
||||||
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
|
autoView( in_v , in, CpuRead);
|
||||||
|
autoView( out_v , out, CpuWrite);
|
||||||
|
// thread_for( sss, in.Grid()->oSites(),{
|
||||||
|
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
||||||
|
// });
|
||||||
|
assert(0);
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
|
else
|
||||||
|
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
Compressor compressor;
|
||||||
|
int len = U.Grid()->oSites();
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
st.Prepare();
|
||||||
|
st.HaloGather(in,compressor);
|
||||||
|
DhopFaceTime += usecond();
|
||||||
|
|
||||||
|
DhopCommTime -=usecond();
|
||||||
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
|
st.CommunicateBegin(requests);
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
|
st.CommsMergeSHM(compressor);
|
||||||
|
DhopFaceTime+= usecond();
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Removed explicit thread comms
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
|
{
|
||||||
|
int interior=1;
|
||||||
|
int exterior=0;
|
||||||
|
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||||
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
|
||||||
|
st.CommunicateComplete(requests);
|
||||||
|
DhopCommTime +=usecond();
|
||||||
|
|
||||||
|
// First to enter, last to leave timing
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
st.CommsMerge(compressor);
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
|
||||||
|
DhopComputeTime2 -= usecond();
|
||||||
|
{
|
||||||
|
int interior=0;
|
||||||
|
int exterior=1;
|
||||||
|
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||||
|
}
|
||||||
|
DhopComputeTime2 += usecond();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
|
||||||
|
DhopCommTime -= usecond();
|
||||||
|
Compressor compressor;
|
||||||
|
st.HaloExchange(in, compressor);
|
||||||
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
|
{
|
||||||
|
int interior=1;
|
||||||
|
int exterior=1;
|
||||||
|
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||||
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
DhopTotalTime += usecond();
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Reporting
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
template<class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::Report(void)
|
||||||
|
{
|
||||||
|
Coordinate latt = _grid->GlobalDimensions();
|
||||||
|
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||||
|
RealD NP = _grid->_Nprocessors;
|
||||||
|
RealD NN = _grid->NodeCount();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls : "
|
||||||
|
<< DhopCalls << std::endl;
|
||||||
|
std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime /Calls : "
|
||||||
|
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime /Calls : "
|
||||||
|
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls : "
|
||||||
|
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||||
|
|
||||||
|
// Average the compute time
|
||||||
|
_grid->GlobalSum(DhopComputeTime);
|
||||||
|
DhopComputeTime/=NP;
|
||||||
|
|
||||||
|
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||||
|
|
||||||
|
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil" <<std::endl; Stencil.Report();
|
||||||
|
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
|
||||||
|
std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
||||||
|
}
|
||||||
|
template<class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
|
||||||
|
{
|
||||||
|
DhopCalls = 0;
|
||||||
|
DhopTotalTime = 0;
|
||||||
|
DhopCommTime = 0;
|
||||||
|
DhopComputeTime = 0;
|
||||||
|
DhopFaceTime = 0;
|
||||||
|
|
||||||
|
Stencil.ZeroCounters();
|
||||||
|
StencilEven.ZeroCounters();
|
||||||
|
StencilOdd.ZeroCounters();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
// Conserved current - not yet implemented.
|
||||||
|
////////////////////////////////////////////////////////
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
|
PropagatorField &q_in_2,
|
||||||
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
|
Current curr_type,
|
||||||
|
unsigned int mu)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void NaiveStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||||
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
|
Current curr_type,
|
||||||
|
unsigned int mu,
|
||||||
|
unsigned int tmin,
|
||||||
|
unsigned int tmax,
|
||||||
|
ComplexField &lattice_cmplx)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -269,16 +269,14 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD PartialFractionFermion5D<Impl>::M (const FermionField &in, FermionField &out)
|
void PartialFractionFermion5D<Impl>::M (const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
M_internal(in,out,DaggerNo);
|
M_internal(in,out,DaggerNo);
|
||||||
return norm2(out);
|
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
RealD PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
|
void PartialFractionFermion5D<Impl>::Mdag (const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
M_internal(in,out,DaggerYes);
|
M_internal(in,out,DaggerYes);
|
||||||
return norm2(out);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
|
@ -618,10 +618,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs,
|
SiteSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -680,12 +680,13 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
||||||
gauge3 =(uint64_t)&UU[sU]( T );
|
gauge3 =(uint64_t)&UU[sU]( T );
|
||||||
|
|
||||||
|
|
||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
#include <Grid/simd/Intel512single.h>
|
#include <Grid/simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs,
|
SiteSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -702,9 +703,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
|
|
||||||
int sF=s+LLs*sU;
|
// int sF=s+LLs*sU;
|
||||||
|
{
|
||||||
// Xp, Yp, Zp, Tp
|
// Xp, Yp, Zp, Tp
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
@ -736,10 +738,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
}
|
}
|
||||||
|
|
||||||
#include <Grid/simd/Intel512double.h>
|
#include <Grid/simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs,
|
SiteSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -756,8 +758,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
int sF=s+LLs*sU;
|
// int sF=s+LLs*sU;
|
||||||
|
{
|
||||||
// Xp, Yp, Zp, Tp
|
// Xp, Yp, Zp, Tp
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
LOAD_CHI(addr0,addr1,addr2,addr3);
|
LOAD_CHI(addr0,addr1,addr2,addr3);
|
||||||
@ -821,10 +824,10 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
|
|
||||||
#include <Grid/simd/Intel512single.h>
|
#include <Grid/simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs,
|
SiteSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -841,9 +844,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
|
// int sF=s+LLs*sU;
|
||||||
int sF=s+LLs*sU;
|
{
|
||||||
// Xp, Yp, Zp, Tp
|
// Xp, Yp, Zp, Tp
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHIa(addr0,addr1);
|
||||||
@ -890,10 +893,10 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#include <Grid/simd/Intel512double.h>
|
#include <Grid/simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,
|
DoubledGaugeFieldView &U,
|
||||||
DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs,
|
SiteSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
int sU, const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
@ -910,9 +913,9 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
StencilEntry *SE2;
|
StencilEntry *SE2;
|
||||||
StencilEntry *SE3;
|
StencilEntry *SE3;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
|
// int sF=s+LLs*sU;
|
||||||
int sF=s+LLs*sU;
|
{
|
||||||
// Xp, Yp, Zp, Tp
|
// Xp, Yp, Zp, Tp
|
||||||
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
PREPARE(Xp,Yp,Zp,Tp,0,U);
|
||||||
LOAD_CHIa(addr0,addr1);
|
LOAD_CHIa(addr0,addr1);
|
||||||
|
@ -146,9 +146,10 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
template <int Naik>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
@ -181,8 +182,9 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int skew;
|
int skew;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
int sF=s+LLs*sU;
|
// int sF=s+LLs*sU;
|
||||||
|
{
|
||||||
|
|
||||||
skew = 0;
|
skew = 0;
|
||||||
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
||||||
@ -193,6 +195,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
||||||
|
if (Naik) {
|
||||||
skew = 8;
|
skew = 8;
|
||||||
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
||||||
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
||||||
@ -202,7 +205,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
||||||
|
}
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
result()()(0) = - even_0 - odd_0;
|
result()()(0) = - even_0 - odd_0;
|
||||||
result()()(1) = - even_1 - odd_1;
|
result()()(1) = - even_1 - odd_1;
|
||||||
@ -218,9 +221,10 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
template <int Naik>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
@ -253,8 +257,9 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int skew;
|
int skew;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
int sF=s+LLs*sU;
|
// int sF=s+LLs*sU;
|
||||||
|
{
|
||||||
|
|
||||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||||
@ -268,6 +273,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
||||||
|
if (Naik) {
|
||||||
skew = 8;
|
skew = 8;
|
||||||
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
||||||
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
||||||
@ -277,7 +283,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
||||||
|
}
|
||||||
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
result()()(0) = - even_0 - odd_0;
|
result()()(0) = - even_0 - odd_0;
|
||||||
@ -294,9 +300,10 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
template <int Naik>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag)
|
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
@ -329,8 +336,9 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int skew;
|
int skew;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
int sF=s+LLs*sU;
|
// int sF=s+LLs*sU;
|
||||||
|
{
|
||||||
|
|
||||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
||||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
||||||
@ -344,6 +352,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
||||||
|
if (Naik) {
|
||||||
skew = 8;
|
skew = 8;
|
||||||
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
||||||
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
||||||
@ -353,7 +362,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
||||||
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
||||||
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
||||||
|
}
|
||||||
// Add sum of all exterior connected stencil legs
|
// Add sum of all exterior connected stencil legs
|
||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
@ -370,6 +379,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||||
@ -385,7 +395,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||||
SiteSpinor *buf, int LLs, int sU, \
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||||
|
*/
|
||||||
#undef LOAD_CHI
|
#undef LOAD_CHI
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -78,10 +78,12 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
|||||||
// Int, Ext, Int+Ext cases for comms overlap
|
// Int, Ext, Int+Ext cases for comms overlap
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
template <int Naik>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag) {
|
const FermionFieldView &in, FermionFieldView &out, int dag)
|
||||||
|
{
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
SiteSpinor chi;
|
SiteSpinor chi;
|
||||||
SiteSpinor Uchi;
|
SiteSpinor Uchi;
|
||||||
@ -89,8 +91,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
int ptype;
|
int ptype;
|
||||||
int skew;
|
int skew;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
int sF=LLs*sU+s;
|
//
|
||||||
|
// int sF=LLs*sU+s;
|
||||||
|
{
|
||||||
skew = 0;
|
skew = 0;
|
||||||
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
||||||
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
||||||
@ -100,6 +104,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( Naik ) {
|
||||||
skew=8;
|
skew=8;
|
||||||
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
@ -109,6 +114,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
}
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
Uchi = - Uchi;
|
Uchi = - Uchi;
|
||||||
}
|
}
|
||||||
@ -120,9 +126,10 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
// Only contributions from interior of our node
|
// Only contributions from interior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
template <int Naik>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
SiteSpinor chi;
|
SiteSpinor chi;
|
||||||
@ -131,8 +138,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
|
|||||||
int ptype;
|
int ptype;
|
||||||
int skew ;
|
int skew ;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
int sF=LLs*sU+s;
|
// int sF=LLs*sU+s;
|
||||||
|
{
|
||||||
skew = 0;
|
skew = 0;
|
||||||
Uchi=Zero();
|
Uchi=Zero();
|
||||||
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
@ -143,6 +151,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
|
|||||||
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( Naik ) {
|
||||||
skew=8;
|
skew=8;
|
||||||
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
@ -152,6 +161,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
|
|||||||
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
}
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
Uchi = - Uchi;
|
Uchi = - Uchi;
|
||||||
}
|
}
|
||||||
@ -164,9 +174,10 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &
|
|||||||
// Only contributions from exterior of our node
|
// Only contributions from exterior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
template <int Naik>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
// SiteSpinor chi;
|
// SiteSpinor chi;
|
||||||
@ -176,8 +187,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
|
|||||||
int nmu=0;
|
int nmu=0;
|
||||||
int skew ;
|
int skew ;
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
int sF=LLs*sU+s;
|
// int sF=LLs*sU+s;
|
||||||
|
{
|
||||||
skew = 0;
|
skew = 0;
|
||||||
Uchi=Zero();
|
Uchi=Zero();
|
||||||
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
@ -188,6 +200,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
|
|||||||
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( Naik ) {
|
||||||
skew=8;
|
skew=8;
|
||||||
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
@ -197,7 +210,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
|
|||||||
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
}
|
||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
out[sF] = out[sF] - Uchi;
|
out[sF] = out[sF] - Uchi;
|
||||||
@ -211,72 +224,9 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &
|
|||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Driving / wrapping routine to select right kernel
|
// Driving / wrapping routine to select right kernel
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
|
||||||
const FermionFieldView &in, FermionFieldView &out,
|
|
||||||
int interior,int exterior)
|
|
||||||
{
|
|
||||||
int dag=1;
|
|
||||||
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
|
||||||
};
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
|
||||||
const FermionFieldView &in, FermionFieldView &out,
|
|
||||||
int interior,int exterior)
|
|
||||||
{
|
|
||||||
int dag=0;
|
|
||||||
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
|
||||||
};
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
|
||||||
SiteSpinor *buf, int LLs,
|
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out,
|
|
||||||
int dag,int interior,int exterior)
|
|
||||||
{
|
|
||||||
switch(Opt) {
|
|
||||||
#ifdef AVX512
|
|
||||||
case OptInlineAsm:
|
|
||||||
if ( interior && exterior ) {
|
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
} else {
|
|
||||||
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
case OptHandUnroll:
|
|
||||||
if ( interior && exterior ) {
|
|
||||||
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
} else if ( interior ) {
|
|
||||||
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
} else if ( exterior ) {
|
|
||||||
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case OptGeneric:
|
|
||||||
if ( interior && exterior ) {
|
|
||||||
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
} else if ( interior ) {
|
|
||||||
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
} else if ( exterior ) {
|
|
||||||
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
std::cout<<"Oops Opt = "<<Opt<<std::endl;
|
|
||||||
assert(0);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF,
|
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
|
|
||||||
{
|
{
|
||||||
// Disp should be either +1,-1,+3,-3
|
// Disp should be either +1,-1,+3,-3
|
||||||
// What about "dag" ?
|
// What about "dag" ?
|
||||||
@ -285,6 +235,108 @@ void StaggeredKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeFieldVi
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define KERNEL_CALLNB(A,improved) \
|
||||||
|
const uint64_t NN = Nsite*Ls; \
|
||||||
|
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
||||||
|
int sF = ss; \
|
||||||
|
int sU = ss/Ls; \
|
||||||
|
ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
|
||||||
|
});
|
||||||
|
|
||||||
|
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
|
||||||
|
|
||||||
|
#define ASM_CALL(A) \
|
||||||
|
const uint64_t NN = Nsite*Ls; \
|
||||||
|
thread_for( ss, NN, { \
|
||||||
|
int sF = ss; \
|
||||||
|
int sU = ss/Ls; \
|
||||||
|
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
|
||||||
|
});
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||||
|
{
|
||||||
|
GridBase *FGrid=in.Grid();
|
||||||
|
GridBase *UGrid=U.Grid();
|
||||||
|
typedef StaggeredKernels<Impl> ThisKernel;
|
||||||
|
autoView( UUU_v , UUU, AcceleratorRead);
|
||||||
|
autoView( U_v , U, AcceleratorRead);
|
||||||
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
|
autoView( st_v , st, AcceleratorRead);
|
||||||
|
SiteSpinor * buf = st.CommBuf();
|
||||||
|
|
||||||
|
int Ls=1;
|
||||||
|
if(FGrid->Nd()==UGrid->Nd()+1){
|
||||||
|
Ls = FGrid->_rdimensions[0];
|
||||||
|
}
|
||||||
|
int Nsite = UGrid->oSites();
|
||||||
|
|
||||||
|
if( interior && exterior ) {
|
||||||
|
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
|
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
|
||||||
|
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
|
||||||
|
#endif
|
||||||
|
} else if( interior ) {
|
||||||
|
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
|
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
|
||||||
|
#endif
|
||||||
|
} else if( exterior ) {
|
||||||
|
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
|
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||||
|
{
|
||||||
|
GridBase *FGrid=in.Grid();
|
||||||
|
GridBase *UGrid=U.Grid();
|
||||||
|
typedef StaggeredKernels<Impl> ThisKernel;
|
||||||
|
autoView( UUU_v , U, AcceleratorRead);
|
||||||
|
autoView( U_v , U, AcceleratorRead);
|
||||||
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
|
autoView( st_v , st, AcceleratorRead);
|
||||||
|
SiteSpinor * buf = st.CommBuf();
|
||||||
|
|
||||||
|
int Ls=1;
|
||||||
|
if(FGrid->Nd()==UGrid->Nd()+1){
|
||||||
|
Ls = FGrid->_rdimensions[0];
|
||||||
|
}
|
||||||
|
int Nsite = UGrid->oSites();
|
||||||
|
|
||||||
|
if( interior && exterior ) {
|
||||||
|
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
|
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
|
||||||
|
#endif
|
||||||
|
} else if( interior ) {
|
||||||
|
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
|
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
|
||||||
|
#endif
|
||||||
|
} else if( exterior ) {
|
||||||
|
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
|
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef KERNEL_CALLNB
|
||||||
|
#undef KERNEL_CALL
|
||||||
|
#undef ASM_CALL
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
@ -35,7 +35,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
// *NOT* EO
|
// *NOT* EO
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
|
void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
FermionField temp(out.Grid());
|
FermionField temp(out.Grid());
|
||||||
|
|
||||||
@ -47,11 +47,10 @@ RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
|
|||||||
Mooee(in, temp);
|
Mooee(in, temp);
|
||||||
|
|
||||||
out += temp;
|
out += temp;
|
||||||
return norm2(out);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
FermionField temp(out.Grid());
|
FermionField temp(out.Grid());
|
||||||
|
|
||||||
@ -63,7 +62,6 @@ RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
|||||||
MooeeDag(in, temp);
|
MooeeDag(in, temp);
|
||||||
|
|
||||||
out += temp;
|
out += temp;
|
||||||
return norm2(out);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -100,11 +98,13 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
Coordinate lcoor;
|
Coordinate lcoor;
|
||||||
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
||||||
|
|
||||||
for (int site = 0; site < lvol; site++)
|
|
||||||
{
|
{
|
||||||
|
autoView(CTv,CloverTerm,CpuRead);
|
||||||
|
autoView(CTIv,CloverTermInv,CpuWrite);
|
||||||
|
for (int site = 0; site < lvol; site++) {
|
||||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||||
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||||
peekLocalSite(Qx, CloverTerm, lcoor);
|
peekLocalSite(Qx, CTv, lcoor);
|
||||||
Qxinv = Zero();
|
Qxinv = Zero();
|
||||||
//if (csw!=0){
|
//if (csw!=0){
|
||||||
for (int j = 0; j < Ns; j++)
|
for (int j = 0; j < Ns; j++)
|
||||||
@ -125,21 +125,22 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
|
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
|
||||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
||||||
// }
|
// }
|
||||||
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
|
pokeLocalSite(Qxinv, CTIv, lcoor);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Separate the even and odd parts
|
// Separate the even and odd parts
|
||||||
pickCheckerboard(Even, CloverTermEven, CloverTerm);
|
pickCheckerboard(Even, CloverTermEven, CloverTerm);
|
||||||
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
|
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
|
||||||
|
|
||||||
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
|
pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
|
||||||
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
|
pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
|
||||||
|
|
||||||
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
|
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
|
||||||
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
|
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
|
||||||
|
|
||||||
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
|
pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
|
||||||
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
|
pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
|
@ -580,16 +580,21 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
|
|||||||
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
||||||
|
|
||||||
// FIXME Need a Lattice acosh
|
// FIXME Need a Lattice acosh
|
||||||
|
|
||||||
|
{
|
||||||
|
autoView(cosha_v,cosha,CpuRead);
|
||||||
|
autoView(a_v,a,CpuWrite);
|
||||||
for(int idx=0;idx<_grid->lSites();idx++){
|
for(int idx=0;idx<_grid->lSites();idx++){
|
||||||
Coordinate lcoor(Nd);
|
Coordinate lcoor(Nd);
|
||||||
Tcomplex cc;
|
Tcomplex cc;
|
||||||
// RealD sgn;
|
// RealD sgn;
|
||||||
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
peekLocalSite(cc,cosha,lcoor);
|
peekLocalSite(cc,cosha_v,lcoor);
|
||||||
assert((double)real(cc)>=1.0);
|
assert((double)real(cc)>=1.0);
|
||||||
assert(fabs((double)imag(cc))<=1.0e-15);
|
assert(fabs((double)imag(cc))<=1.0e-15);
|
||||||
cc = ScalComplex(::acosh(real(cc)),0.0);
|
cc = ScalComplex(::acosh(real(cc)),0.0);
|
||||||
pokeLocalSite(cc,a,lcoor);
|
pokeLocalSite(cc,a_v,lcoor);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Wea = ( exp( a) * abs(W) );
|
Wea = ( exp( a) * abs(W) );
|
||||||
@ -775,17 +780,20 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
|
|||||||
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
cosha = (one + W*W + sk) / (abs(W)*2.0);
|
||||||
|
|
||||||
// FIXME Need a Lattice acosh
|
// FIXME Need a Lattice acosh
|
||||||
|
{
|
||||||
|
autoView(cosha_v,cosha,CpuRead);
|
||||||
|
autoView(a_v,a,CpuWrite);
|
||||||
for(int idx=0;idx<_grid->lSites();idx++){
|
for(int idx=0;idx<_grid->lSites();idx++){
|
||||||
Coordinate lcoor(Nd);
|
Coordinate lcoor(Nd);
|
||||||
Tcomplex cc;
|
Tcomplex cc;
|
||||||
// RealD sgn;
|
// RealD sgn;
|
||||||
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
_grid->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
peekLocalSite(cc,cosha,lcoor);
|
peekLocalSite(cc,cosha_v,lcoor);
|
||||||
assert((double)real(cc)>=1.0);
|
assert((double)real(cc)>=1.0);
|
||||||
assert(fabs((double)imag(cc))<=1.0e-15);
|
assert(fabs((double)imag(cc))<=1.0e-15);
|
||||||
cc = ScalComplex(::acosh(real(cc)),0.0);
|
cc = ScalComplex(::acosh(real(cc)),0.0);
|
||||||
pokeLocalSite(cc,a,lcoor);
|
pokeLocalSite(cc,a_v,lcoor);
|
||||||
}
|
}}
|
||||||
|
|
||||||
Wea = ( exp( a) * abs(W) );
|
Wea = ( exp( a) * abs(W) );
|
||||||
Wema= ( exp(-a) * abs(W) );
|
Wema= ( exp(-a) * abs(W) );
|
||||||
@ -861,7 +869,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
|||||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||||
* to make a conserved current sink or inserting the conserved current
|
* to make a conserved current sink or inserting the conserved current
|
||||||
* sequentially.
|
* sequentially.
|
||||||
******************************************************************************/
|
|
||||||
|
|
||||||
// Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
|
// Helper macro to reverse Simd vector. Fixme: slow, generic implementation.
|
||||||
#define REVERSE_LS(qSite, qSiteRev, Nsimd) \
|
#define REVERSE_LS(qSite, qSiteRev, Nsimd) \
|
||||||
@ -877,220 +884,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
|||||||
merge(qSiteRev, qSiteVec); \
|
merge(qSiteRev, qSiteVec); \
|
||||||
}
|
}
|
||||||
|
|
||||||
// psi = chiralProjectPlus(Result_s[Ls/2-1]);
|
******************************************************************************/
|
||||||
// psi+= chiralProjectMinus(Result_s[Ls/2]);
|
|
||||||
// PJ5q+=localInnerProduct(psi,psi);
|
|
||||||
|
|
||||||
template<class vobj>
|
|
||||||
Lattice<vobj> spProj5p(const Lattice<vobj> & in)
|
|
||||||
{
|
|
||||||
GridBase *grid=in.Grid();
|
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
|
||||||
Lattice<vobj> ret(grid);
|
|
||||||
auto ret_v = ret.View();
|
|
||||||
auto in_v = in.View();
|
|
||||||
thread_for(ss,grid->oSites(),{
|
|
||||||
ret_v[ss] = in_v[ss] + G5*in_v[ss];
|
|
||||||
});
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
template<class vobj>
|
|
||||||
Lattice<vobj> spProj5m(const Lattice<vobj> & in)
|
|
||||||
{
|
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
|
||||||
GridBase *grid=in.Grid();
|
|
||||||
Lattice<vobj> ret(grid);
|
|
||||||
auto ret_v = ret.View();
|
|
||||||
auto in_v = in.View();
|
|
||||||
thread_for(ss,grid->oSites(),{
|
|
||||||
ret_v[ss] = in_v[ss] - G5*in_v[ss];
|
|
||||||
});
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void WilsonFermion5D<Impl>::ContractJ5q(FermionField &q_in,ComplexField &J5q)
|
|
||||||
{
|
|
||||||
conformable(GaugeGrid(), J5q.Grid());
|
|
||||||
conformable(q_in.Grid(), FermionGrid());
|
|
||||||
|
|
||||||
// 4d field
|
|
||||||
int Ls = this->Ls;
|
|
||||||
FermionField psi(GaugeGrid());
|
|
||||||
FermionField p_plus (GaugeGrid());
|
|
||||||
FermionField p_minus(GaugeGrid());
|
|
||||||
FermionField p(GaugeGrid());
|
|
||||||
|
|
||||||
ExtractSlice(p_plus , q_in, Ls/2 , 0);
|
|
||||||
ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
|
|
||||||
p_plus = spProj5p(p_plus );
|
|
||||||
p_minus= spProj5m(p_minus);
|
|
||||||
p=p_plus+p_minus;
|
|
||||||
J5q = localInnerProduct(p,p);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void WilsonFermion5D<Impl>::ContractJ5q(PropagatorField &q_in,ComplexField &J5q)
|
|
||||||
{
|
|
||||||
conformable(GaugeGrid(), J5q.Grid());
|
|
||||||
conformable(q_in.Grid(), FermionGrid());
|
|
||||||
|
|
||||||
// 4d field
|
|
||||||
int Ls = this->Ls;
|
|
||||||
PropagatorField psi(GaugeGrid());
|
|
||||||
PropagatorField p_plus (GaugeGrid());
|
|
||||||
PropagatorField p_minus(GaugeGrid());
|
|
||||||
PropagatorField p(GaugeGrid());
|
|
||||||
|
|
||||||
ExtractSlice(p_plus , q_in, Ls/2 , 0);
|
|
||||||
ExtractSlice(p_minus, q_in, Ls/2-1 , 0);
|
|
||||||
p_plus = spProj5p(p_plus );
|
|
||||||
p_minus= spProj5m(p_minus);
|
|
||||||
p=p_plus+p_minus;
|
|
||||||
J5q = localInnerProduct(p,p);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void WilsonFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|
||||||
PropagatorField &q_in_2,
|
|
||||||
PropagatorField &q_out,
|
|
||||||
Current curr_type,
|
|
||||||
unsigned int mu)
|
|
||||||
{
|
|
||||||
conformable(q_in_1.Grid(), FermionGrid());
|
|
||||||
conformable(q_in_1.Grid(), q_in_2.Grid());
|
|
||||||
conformable(_FourDimGrid, q_out.Grid());
|
|
||||||
|
|
||||||
PropagatorField tmp1(FermionGrid()), tmp2(FermionGrid());
|
|
||||||
unsigned int LLs = q_in_1.Grid()->_rdimensions[0];
|
|
||||||
q_out = Zero();
|
|
||||||
|
|
||||||
// Forward, need q1(x + mu, s), q2(x, Ls - 1 - s). Backward, need q1(x, s),
|
|
||||||
// q2(x + mu, Ls - 1 - s). 5D lattice so shift 4D coordinate mu by one.
|
|
||||||
tmp1 = Cshift(q_in_1, mu + 1, 1);
|
|
||||||
tmp2 = Cshift(q_in_2, mu + 1, 1);
|
|
||||||
auto q_in_1_v = q_in_1.View();
|
|
||||||
auto q_in_2_v = q_in_2.View();
|
|
||||||
auto tmp1_v = tmp1.View();
|
|
||||||
auto tmp2_v = tmp2.View();
|
|
||||||
auto q_out_v = q_out.View();
|
|
||||||
auto Umu_v = Umu.View();
|
|
||||||
thread_for(sU, Umu.Grid()->oSites(),{
|
|
||||||
|
|
||||||
unsigned int sF1 = sU * LLs;
|
|
||||||
unsigned int sF2 = (sU + 1) * LLs - 1;
|
|
||||||
|
|
||||||
for (unsigned int s = 0; s < LLs; ++s)
|
|
||||||
{
|
|
||||||
bool axial_sign = ((curr_type == Current::Axial) && \
|
|
||||||
(s < (LLs / 2)));
|
|
||||||
SitePropagator qSite2, qmuSite2;
|
|
||||||
|
|
||||||
// If vectorised in 5th dimension, reverse q2 vector to match up
|
|
||||||
// sites correctly.
|
|
||||||
if (Impl::LsVectorised)
|
|
||||||
{
|
|
||||||
REVERSE_LS(q_in_2_v[sF2], qSite2, Ls / LLs);
|
|
||||||
REVERSE_LS(tmp2_v[sF2], qmuSite2, Ls / LLs);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
qSite2 = q_in_2_v[sF2];
|
|
||||||
qmuSite2 = tmp2_v[sF2];
|
|
||||||
}
|
|
||||||
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sF1],
|
|
||||||
qSite2,
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu, axial_sign);
|
|
||||||
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sF1],
|
|
||||||
qmuSite2,
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu, axial_sign);
|
|
||||||
sF1++;
|
|
||||||
sF2--;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|
||||||
PropagatorField &q_out,
|
|
||||||
Current curr_type,
|
|
||||||
unsigned int mu,
|
|
||||||
unsigned int tmin,
|
|
||||||
unsigned int tmax,
|
|
||||||
ComplexField &lattice_cmplx)
|
|
||||||
{
|
|
||||||
conformable(q_in.Grid(), FermionGrid());
|
|
||||||
conformable(q_in.Grid(), q_out.Grid());
|
|
||||||
PropagatorField tmp(GaugeGrid()),tmp2(GaugeGrid());
|
|
||||||
unsigned int tshift = (mu == Tp) ? 1 : 0;
|
|
||||||
unsigned int LLs = q_in.Grid()->_rdimensions[0];
|
|
||||||
unsigned int LLt = GridDefaultLatt()[Tp];
|
|
||||||
|
|
||||||
q_out = Zero();
|
|
||||||
LatticeInteger coords(_FourDimGrid);
|
|
||||||
LatticeCoordinate(coords, Tp);
|
|
||||||
|
|
||||||
auto q_out_v = q_out.View();
|
|
||||||
auto tmp2_v = tmp2.View();
|
|
||||||
auto coords_v= coords.View();
|
|
||||||
auto Umu_v = Umu.View();
|
|
||||||
for (unsigned int s = 0; s < LLs; ++s)
|
|
||||||
{
|
|
||||||
bool axial_sign = ((curr_type == Current::Axial) && (s < (LLs / 2)));
|
|
||||||
bool tadpole_sign = (curr_type == Current::Tadpole);
|
|
||||||
bool switch_sgn = tadpole_sign || axial_sign;
|
|
||||||
|
|
||||||
|
|
||||||
//forward direction: Need q(x + mu, s)*A(x)
|
|
||||||
ExtractSlice(tmp2, q_in, s, 0); //q(x,s)
|
|
||||||
tmp = Cshift(tmp2, mu, 1); //q(x+mu,s)
|
|
||||||
tmp2 = tmp*lattice_cmplx; //q(x+mu,s)*A(x)
|
|
||||||
|
|
||||||
thread_for(sU, Umu.Grid()->oSites(),{
|
|
||||||
// Compute the sequential conserved current insertion only if our simd
|
|
||||||
// object contains a timeslice we need.
|
|
||||||
vPredicate t_mask;
|
|
||||||
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
|
|
||||||
Integer timeSlices = Reduce(t_mask());
|
|
||||||
|
|
||||||
if (timeSlices > 0)
|
|
||||||
{
|
|
||||||
unsigned int sF = sU * LLs + s;
|
|
||||||
Kernels::SeqConservedCurrentSiteFwd(tmp2_v[sU],
|
|
||||||
q_out_v[sF], Umu_v, sU,
|
|
||||||
mu, t_mask, switch_sgn);
|
|
||||||
}
|
|
||||||
|
|
||||||
});
|
|
||||||
|
|
||||||
//backward direction: Need q(x - mu, s)*A(x-mu)
|
|
||||||
ExtractSlice(tmp2, q_in, s, 0); //q(x,s)
|
|
||||||
tmp = lattice_cmplx*tmp2; //q(x,s)*A(x)
|
|
||||||
tmp2 = Cshift(tmp, mu, -1); //q(x-mu,s)*A(x-mu,s)
|
|
||||||
|
|
||||||
thread_for(sU, Umu.Grid()->oSites(),
|
|
||||||
{
|
|
||||||
vPredicate t_mask;
|
|
||||||
t_mask()= ((coords_v[sU] >= (tmin + tshift)) && (coords_v[sU] <= (tmax + tshift)));
|
|
||||||
|
|
||||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
|
||||||
unsigned int t0 = 0;
|
|
||||||
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
|
|
||||||
|
|
||||||
Integer timeSlices = Reduce(t_mask());
|
|
||||||
|
|
||||||
if (timeSlices > 0) {
|
|
||||||
unsigned int sF = sU * LLs + s;
|
|
||||||
Kernels::SeqConservedCurrentSiteBwd(tmp2_v[sU],
|
|
||||||
q_out_v[sF], Umu_v, sU,
|
|
||||||
mu, t_mask, axial_sign);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -67,7 +67,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
|||||||
diag_mass = 4.0 + mass;
|
diag_mass = 4.0 + mass;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int vol4;
|
||||||
|
vol4=Fgrid.oSites();
|
||||||
|
Stencil.BuildSurfaceList(1,vol4);
|
||||||
|
vol4=Hgrid.oSites();
|
||||||
|
StencilEven.BuildSurfaceList(1,vol4);
|
||||||
|
StencilOdd.BuildSurfaceList(1,vol4);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -187,21 +192,24 @@ void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
|
void WilsonFermion<Impl>::M(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Dhop(in, out, DaggerNo);
|
Dhop(in, out, DaggerNo);
|
||||||
return axpy_norm(out, diag_mass, in, out);
|
axpy(out, diag_mass, in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
|
void WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Dhop(in, out, DaggerYes);
|
Dhop(in, out, DaggerYes);
|
||||||
return axpy_norm(out, diag_mass, in, out);
|
axpy(out, diag_mass, in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
if (in.Checkerboard() == Odd) {
|
if (in.Checkerboard() == Odd) {
|
||||||
DhopEO(in, out, DaggerNo);
|
DhopEO(in, out, DaggerNo);
|
||||||
} else {
|
} else {
|
||||||
@ -210,7 +218,8 @@ void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
if (in.Checkerboard() == Odd) {
|
if (in.Checkerboard() == Odd) {
|
||||||
DhopEO(in, out, DaggerYes);
|
DhopEO(in, out, DaggerYes);
|
||||||
} else {
|
} else {
|
||||||
@ -219,26 +228,30 @@ void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
|
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
typename FermionField::scalar_type scal(diag_mass);
|
typename FermionField::scalar_type scal(diag_mass);
|
||||||
out = scal * in;
|
out = scal * in;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
|
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
Mooee(in, out);
|
Mooee(in, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
|
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
out = (1.0/(diag_mass))*in;
|
out = (1.0/(diag_mass))*in;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
|
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
MooeeInv(in,out);
|
MooeeInv(in,out);
|
||||||
}
|
}
|
||||||
@ -341,7 +354,8 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
|
{
|
||||||
conformable(U.Grid(), _grid);
|
conformable(U.Grid(), _grid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
conformable(U.Grid(), mat.Grid());
|
conformable(U.Grid(), mat.Grid());
|
||||||
@ -352,7 +366,8 @@ void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
|
{
|
||||||
conformable(U.Grid(), _cbgrid);
|
conformable(U.Grid(), _cbgrid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
//conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
|
//conformable(U.Grid(), mat.Grid()); not general, leaving as a comment (Guido)
|
||||||
@ -366,7 +381,8 @@ void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, co
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
|
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
|
||||||
|
{
|
||||||
conformable(U.Grid(), _cbgrid);
|
conformable(U.Grid(), _cbgrid);
|
||||||
conformable(U.Grid(), V.Grid());
|
conformable(U.Grid(), V.Grid());
|
||||||
//conformable(U.Grid(), mat.Grid());
|
//conformable(U.Grid(), mat.Grid());
|
||||||
@ -379,8 +395,8 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
|
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
||||||
DhopCalls+=2;
|
{
|
||||||
conformable(in.Grid(), _grid); // verifies full grid
|
conformable(in.Grid(), _grid); // verifies full grid
|
||||||
conformable(in.Grid(), out.Grid());
|
conformable(in.Grid(), out.Grid());
|
||||||
|
|
||||||
@ -390,8 +406,8 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
|
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
||||||
DhopCalls+=1;
|
{
|
||||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
@ -402,8 +418,8 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
|
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
DhopCalls+=1;
|
{
|
||||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
@ -482,7 +498,8 @@ template <class Impl>
|
|||||||
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag) {
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
@ -547,7 +564,8 @@ template <class Impl>
|
|||||||
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag) {
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
DhopCommTime-=usecond();
|
DhopCommTime-=usecond();
|
||||||
@ -574,6 +592,7 @@ template <class Impl>
|
|||||||
void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
||||||
PropagatorField &q_in_2,
|
PropagatorField &q_in_2,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu)
|
unsigned int mu)
|
||||||
{
|
{
|
||||||
@ -581,35 +600,14 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|||||||
conformable(_grid, q_in_1.Grid());
|
conformable(_grid, q_in_1.Grid());
|
||||||
conformable(_grid, q_in_2.Grid());
|
conformable(_grid, q_in_2.Grid());
|
||||||
conformable(_grid, q_out.Grid());
|
conformable(_grid, q_out.Grid());
|
||||||
PropagatorField tmp1(_grid), tmp2(_grid);
|
assert(0);
|
||||||
q_out = Zero();
|
|
||||||
|
|
||||||
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
|
|
||||||
// Inefficient comms method but not performance critical.
|
|
||||||
tmp1 = Cshift(q_in_1, mu, 1);
|
|
||||||
tmp2 = Cshift(q_in_2, mu, 1);
|
|
||||||
auto tmp1_v = tmp1.View();
|
|
||||||
auto tmp2_v = tmp2.View();
|
|
||||||
auto q_in_1_v=q_in_1.View();
|
|
||||||
auto q_in_2_v=q_in_2.View();
|
|
||||||
auto q_out_v = q_out.View();
|
|
||||||
auto Umu_v = Umu.View();
|
|
||||||
thread_for(sU, Umu.Grid()->oSites(),{
|
|
||||||
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
|
|
||||||
q_in_2_v[sU],
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu);
|
|
||||||
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
|
|
||||||
tmp2_v[sU],
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
||||||
PropagatorField &q_out,
|
PropagatorField &q_out,
|
||||||
|
PropagatorField &src,
|
||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu,
|
unsigned int mu,
|
||||||
unsigned int tmin,
|
unsigned int tmin,
|
||||||
@ -618,59 +616,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
{
|
{
|
||||||
conformable(_grid, q_in.Grid());
|
conformable(_grid, q_in.Grid());
|
||||||
conformable(_grid, q_out.Grid());
|
conformable(_grid, q_out.Grid());
|
||||||
|
assert(0);
|
||||||
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
|
|
||||||
Complex i(0.0,1.0);
|
|
||||||
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
|
|
||||||
unsigned int tshift = (mu == Tp) ? 1 : 0;
|
|
||||||
unsigned int LLt = GridDefaultLatt()[Tp];
|
|
||||||
|
|
||||||
q_out = Zero();
|
|
||||||
LatticeInteger coords(_grid);
|
|
||||||
LatticeCoordinate(coords, Tp);
|
|
||||||
|
|
||||||
// Need q(x + mu) and q(x - mu).
|
|
||||||
tmp = Cshift(q_in, mu, 1);
|
|
||||||
tmpFwd = tmp*lattice_cmplx;
|
|
||||||
tmp = lattice_cmplx*q_in;
|
|
||||||
tmpBwd = Cshift(tmp, mu, -1);
|
|
||||||
|
|
||||||
auto coords_v = coords.View();
|
|
||||||
auto tmpFwd_v = tmpFwd.View();
|
|
||||||
auto tmpBwd_v = tmpBwd.View();
|
|
||||||
auto Umu_v = Umu.View();
|
|
||||||
auto q_out_v = q_out.View();
|
|
||||||
|
|
||||||
thread_for(sU, Umu.Grid()->oSites(), {
|
|
||||||
|
|
||||||
// Compute the sequential conserved current insertion only if our simd
|
|
||||||
// object contains a timeslice we need.
|
|
||||||
vPredicate t_mask;
|
|
||||||
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
|
|
||||||
Integer timeSlices = Reduce(t_mask());
|
|
||||||
|
|
||||||
if (timeSlices > 0) {
|
|
||||||
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu, t_mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Repeat for backward direction.
|
|
||||||
t_mask() = ((coords_v[sU] >= (tmin + tshift)) &&
|
|
||||||
(coords_v[sU] <= (tmax + tshift)));
|
|
||||||
|
|
||||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
|
||||||
unsigned int t0 = 0;
|
|
||||||
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
|
|
||||||
|
|
||||||
timeSlices = Reduce(t_mask());
|
|
||||||
|
|
||||||
if (timeSlices > 0) {
|
|
||||||
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu, t_mask);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user